diff --git a/.github/workflows/agent-evaluation.yml b/.github/workflows/agent-evaluation.yml
new file mode 100644
index 000000000..34b554cbc
--- /dev/null
+++ b/.github/workflows/agent-evaluation.yml
@@ -0,0 +1,190 @@
+name: Agent Evaluation
+
+on:
+  # Run on PRs touching agent/evaluation code; no inputs exist for this trigger, so the job's `||` fallbacks apply
+  pull_request:
+    paths:
+      - 'agentic_ai/agents/**'
+      - 'agentic_ai/evaluations/**'
+  
+  # Manual trigger; the only trigger that exposes eval_type
+  workflow_dispatch:
+    inputs:
+      environment:
+        description: Target environment
+        type: choice
+        options: [dev, integration]
+        default: dev
+      agent_name:
+        description: 'Agent name for evaluation tracking'
+        type: string
+        default: 'ci-agent'
+      limit:
+        description: 'Limit number of test cases (0 = all)'
+        type: number
+        default: 5
+      eval_type:
+        description: 'Evaluation type'
+        type: choice
+        options: [all, single-turn-only, multi-turn-only]
+        default: all
+      push_to_foundry:
+        description: 'Push results to Azure AI Foundry'
+        type: boolean
+        default: false
+  
+  # Callable from other workflows; callers must pass backend_endpoint and mcp_endpoint (eval_type is not exposed)
+  workflow_call:
+    inputs:
+      environment:
+        type: string
+        required: false
+        default: 'dev'
+      backend_endpoint:
+        type: string
+        required: true
+        description: 'Backend API endpoint URL'
+      mcp_endpoint:
+        type: string
+        required: true
+        description: 'MCP service endpoint URL'
+      agent_name:
+        type: string
+        required: false
+        default: 'ci-agent'
+      limit:
+        type: number
+        required: false
+        default: 0  # 0 = no --limit flag is passed (all test cases)
+      push_to_foundry:
+        type: boolean
+        required: false
+        default: false
+
+env:
+  PYTHON_VERSION: '3.12'  # read by the "Set up Python" step
+
+jobs:
+  # ============================================================================
+  # Evaluation - Run agent evaluation against test scenarios
+  # ============================================================================
+  evaluate:
+    name: Agent Evaluation
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      id-token: write  # For OIDC authentication (used by the azure/login step)
+    
+    environment: ${{ inputs.environment || 'dev' }}  # 'dev' when the trigger supplies no input (e.g. pull_request)
+    
+    steps:
+      - uses: actions/checkout@v4
+      
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          cache: 'pip'  # NOTE(review): dependencies are installed with uv below, not pip — confirm this cache is ever hit
+      
+      - name: Install uv  # current installers target ~/.local/bin, older ones ~/.cargo/bin; export both
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          printf '%s\n' "$HOME/.local/bin" "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+      
+      - name: Install dependencies  # runs `uv sync` from the applications project directory
+        working-directory: agentic_ai/applications
+        run: |
+          uv sync
+      
+      - name: Azure Login (OIDC)  # skipped unless results are pushed to Foundry
+        uses: azure/login@v2
+        if: inputs.push_to_foundry == true
+        with:
+          tenant-id: ${{ vars.AZURE_TENANT_ID }}
+          subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+          client-id: ${{ vars.AZURE_CLIENT_ID }}
+      
+      - name: Get Azure credentials from Key Vault  # best-effort: a missing secret warns instead of failing the job
+        if: ${{ inputs.push_to_foundry == true }}
+        env:
+          KEYVAULT_NAME: ${{ vars.KEYVAULT_NAME }}
+        run: |
+          [ -n "$KEYVAULT_NAME" ] || exit 0  # no vault configured: nothing to fetch
+          for pair in aoai-key=AZURE_OPENAI_API_KEY ai-project-endpoint=AZURE_AI_PROJECT_ENDPOINT; do
+            secret="${pair%%=*}"; target="${pair#*=}"  # split "secret-name=ENV_NAME"
+            value=$(az keyvault secret show --vault-name "$KEYVAULT_NAME" --name "$secret" --query value -o tsv 2>/dev/null || true)
+            if [ -z "$value" ]; then echo "::warning::Key Vault secret '$secret' not available; $target left unset"; continue; fi  # never mask/export an empty value
+            if [ "$target" = "AZURE_OPENAI_API_KEY" ]; then echo "::add-mask::$value"; fi  # only the API key is masked in logs
+            echo "$target=$value" >> "$GITHUB_ENV"
+          done
+      
+      # Workflow inputs are handed to the script as env vars (data) and are never
+      # spliced into the script text, so a crafted input value cannot inject shell.
+      - name: Run Agent Evaluation
+        working-directory: agentic_ai/applications
+        run: |
+          # Build argv as an array so every value stays exactly one argument
+          cmd=(uv run python ../evaluations/run_agent_eval.py)
+          cmd+=(--agent "$AGENT_NAME" --backend-url "$BACKEND_URL")
+          # Add limit if specified (empty or 0 = all test cases)
+          if [ -n "$EVAL_LIMIT" ] && [ "$EVAL_LIMIT" != "0" ]; then
+            cmd+=(--limit "$EVAL_LIMIT")
+          fi
+          # Add eval type filter (empty or "all" adds no flag)
+          case "$EVAL_TYPE" in
+            single-turn-only) cmd+=(--single-turn-only) ;;
+            multi-turn-only) cmd+=(--multi-turn-only) ;;
+          esac
+          # Add remote flag if pushing to Foundry
+          if [ "$PUSH_TO_FOUNDRY" = "true" ]; then
+            cmd+=(--remote)
+          else
+            cmd+=(--local)
+          fi
+          echo "Running: ${cmd[*]}"
+          "${cmd[@]}"
+        env:
+          AGENT_NAME: ${{ inputs.agent_name || 'ci-agent' }}
+          BACKEND_URL: ${{ inputs.backend_endpoint || 'http://localhost:7000' }}
+          EVAL_LIMIT: ${{ inputs.limit }}
+          EVAL_TYPE: ${{ inputs.eval_type }}
+          PUSH_TO_FOUNDRY: ${{ inputs.push_to_foundry }}
+          AZURE_OPENAI_ENDPOINT: ${{ vars.AZURE_OPENAI_ENDPOINT }}
+          AZURE_OPENAI_CHAT_DEPLOYMENT: ${{ vars.AZURE_OPENAI_DEPLOYMENT }}
+          AZURE_OPENAI_API_VERSION: '2025-03-01-preview'
+          MCP_SERVER_URI: ${{ inputs.mcp_endpoint || 'http://localhost:8000/mcp' }}
+      
+      - name: Upload evaluation results
+        if: always()  # still runs when an earlier step failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: evaluation-results
+          retention-days: 30
+          path: |
+            agentic_ai/evaluations/eval_results/
+            agentic_ai/evaluations/evaluation_input_data.jsonl
+      
+      - name: Generate Summary
+        if: always()
+        env:  # inputs travel as env vars so their text is never parsed as shell
+          AGENT_NAME: ${{ inputs.agent_name || 'ci-agent' }}
+          ENVIRONMENT: ${{ inputs.environment || 'dev' }}
+          EVAL_TYPE: ${{ inputs.eval_type || 'all' }}
+          TEST_LIMIT: ${{ inputs.limit || 'all' }}
+          PUSH_TO_FOUNDRY: ${{ inputs.push_to_foundry || 'false' }}
+        run: |
+          # Each printf argument becomes one Markdown line; "" emits a blank line.
+          # The table-row format string is reused for every (setting, value) pair.
+          {
+            printf '%s\n' "## 📊 Agent Evaluation Results" "" "| Setting | Value |" "|---------|-------|"
+            printf '| %s | %s |\n' "Agent" "$AGENT_NAME" "Environment" "$ENVIRONMENT" "Eval Type" "$EVAL_TYPE"
+            printf '| %s | %s |\n' "Test Limit" "$TEST_LIMIT" "Push to Foundry" "$PUSH_TO_FOUNDRY"
+            printf '%s\n' "" "### Metrics Evaluated" ""
+            printf '%s\n' "**Single-Turn (tool-focused):**"
+            printf '%s\n' "- Tool behavior (recall, precision, efficiency)"
+            printf '%s\n' "- Completeness, response quality, grounded accuracy" ""
+            printf '%s\n' "**Multi-Turn (outcome-focused):**"
+            printf '%s\n' "- Solution accuracy, task adherence, intent resolution"
+            printf '%s\n' "- Coherence, fluency, relevance" ""
+            printf '%s\n' "📁 See artifacts for detailed results"
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/agentic_ai/agents/agent_framework/multi_agent/INTEGRATION_GUIDE.md b/agentic_ai/agents/agent_framework/multi_agent/INTEGRATION_GUIDE.md
deleted file mode 100644
index ff01f20b1..000000000
--- a/agentic_ai/agents/agent_framework/multi_agent/INTEGRATION_GUIDE.md
+++ /dev/null
@@ -1,634 +0,0 @@
-# Integration Guide: Workflow Reflection Agent
-
-This guide shows how to integrate the workflow-based reflection agent into your existing application.
-
-## Quick Start
-
-### 1. Import the Agent
-
-```python
-from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-```
-
-### 2. Basic Integration
-
-Replace your existing reflection agent import:
-
-```python
-# OLD: Traditional reflection agent
-# from agentic_ai.agents.agent_framework.multi_agent.reflection_agent import Agent
-
-# NEW: Workflow-based reflection agent
-from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-```
-
-### 3. Use Same Interface
-
-The workflow agent implements the same `BaseAgent` interface:
-
-```python
-# Create agent instance
-state_store = {}
-session_id = "user_123"
-agent = Agent(state_store=state_store, session_id=session_id)
-
-# Optional: Set WebSocket manager for streaming
-agent.set_websocket_manager(ws_manager)
-
-# Chat with user
-response = await agent.chat_async("Help me with billing for customer 1")
-```
-
-## Backend Integration (FastAPI/Flask)
-
-### Example: FastAPI Backend
-
-```python
-from fastapi import FastAPI, WebSocket
-from typing import Dict, Any
-from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-
-app = FastAPI()
-
-# Global state store (in production, use Redis or database)
-state_store: Dict[str, Any] = {}
-
-@app.post("/chat")
-async def chat_endpoint(
-    session_id: str,
-    message: str,
-    use_workflow: bool = True  # Toggle between traditional and workflow
-):
-    """
-    Chat endpoint with workflow reflection agent.
-    """
-    
-    if use_workflow:
-        from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-    else:
-        from agentic_ai.agents.agent_framework.multi_agent.reflection_agent import Agent
-    
-    # Create agent
-    agent = Agent(state_store=state_store, session_id=session_id)
-    
-    # Process message
-    response = await agent.chat_async(message)
-    
-    return {
-        "session_id": session_id,
-        "response": response,
-        "agent_type": "workflow" if use_workflow else "traditional"
-    }
-
-
-@app.websocket("/ws/{session_id}")
-async def websocket_endpoint(websocket: WebSocket, session_id: str):
-    """
-    WebSocket endpoint for streaming support.
-    """
-    await websocket.accept()
-    
-    # Create WebSocket manager (simplified)
-    class WSManager:
-        async def broadcast(self, sid: str, message: dict):
-            if sid == session_id:
-                await websocket.send_json(message)
-    
-    ws_manager = WSManager()
-    
-    try:
-        while True:
-            # Receive message from client
-            data = await websocket.receive_json()
-            message = data.get("message", "")
-            
-            # Create agent with streaming support
-            agent = Agent(state_store=state_store, session_id=session_id)
-            agent.set_websocket_manager(ws_manager)
-            
-            # Process message (will stream updates via WebSocket)
-            response = await agent.chat_async(message)
-            
-            # Send final confirmation
-            await websocket.send_json({
-                "type": "complete",
-                "response": response
-            })
-            
-    except Exception as e:
-        print(f"WebSocket error: {e}")
-    finally:
-        await websocket.close()
-```
-
-## Frontend Integration
-
-### JavaScript/TypeScript Client
-
-```typescript
-interface ChatMessage {
-  role: 'user' | 'assistant';
-  content: string;
-}
-
-interface StreamEvent {
-  type: 'orchestrator' | 'agent_start' | 'agent_token' | 'agent_message' | 'final_result';
-  agent_id?: string;
-  content?: string;
-  kind?: 'plan' | 'progress' | 'result';
-}
-
-class WorkflowReflectionClient {
-  private ws: WebSocket;
-  private sessionId: string;
-  
-  constructor(sessionId: string) {
-    this.sessionId = sessionId;
-    this.ws = new WebSocket(`ws://localhost:8000/ws/${sessionId}`);
-    this.setupEventHandlers();
-  }
-  
-  private setupEventHandlers() {
-    this.ws.onmessage = (event) => {
-      const data: StreamEvent = JSON.parse(event.data);
-      this.handleStreamEvent(data);
-    };
-  }
-  
-  private handleStreamEvent(event: StreamEvent) {
-    switch (event.type) {
-      case 'orchestrator':
-        this.updateOrchestrator(event.kind!, event.content!);
-        break;
-        
-      case 'agent_start':
-        this.showAgentBadge(event.agent_id!);
-        break;
-        
-      case 'agent_token':
-        this.appendToken(event.agent_id!, event.content!);
-        break;
-        
-      case 'agent_message':
-        this.finalizeAgentMessage(event.agent_id!, event.content!);
-        break;
-        
-      case 'final_result':
-        this.displayFinalResponse(event.content!);
-        break;
-    }
-  }
-  
-  private updateOrchestrator(kind: string, content: string) {
-    const container = document.getElementById('orchestrator-status');
-    if (container) {
-      container.innerHTML = `
-        <div class="orchestrator-${kind}">
-          <strong>${kind.toUpperCase()}</strong>
-          <p>${content}</p>
-        </div>
-      `;
-    }
-  }
-  
-  private showAgentBadge(agentId: string) {
-    const badge = document.createElement('div');
-    badge.className = `agent-badge ${agentId}`;
-    badge.textContent = agentId.replace('_', ' ').toUpperCase();
-    document.getElementById('agent-container')?.appendChild(badge);
-  }
-  
-  private appendToken(agentId: string, token: string) {
-    const messageDiv = document.getElementById(`message-${agentId}`) 
-      || this.createMessageDiv(agentId);
-    messageDiv.textContent += token;
-  }
-  
-  private createMessageDiv(agentId: string): HTMLDivElement {
-    const div = document.createElement('div');
-    div.id = `message-${agentId}`;
-    div.className = 'agent-message streaming';
-    document.getElementById('messages-container')?.appendChild(div);
-    return div;
-  }
-  
-  private finalizeAgentMessage(agentId: string, content: string) {
-    const messageDiv = document.getElementById(`message-${agentId}`);
-    if (messageDiv) {
-      messageDiv.classList.remove('streaming');
-      messageDiv.classList.add('complete');
-    }
-  }
-  
-  private displayFinalResponse(content: string) {
-    const responseDiv = document.createElement('div');
-    responseDiv.className = 'final-response';
-    responseDiv.innerHTML = `
-      <div class="message assistant">
-        <strong>Assistant:</strong>
-        <p>${content}</p>
-      </div>
-    `;
-    document.getElementById('chat-container')?.appendChild(responseDiv);
-  }
-  
-  public sendMessage(message: string) {
-    this.ws.send(JSON.stringify({ message }));
-  }
-}
-
-// Usage
-const client = new WorkflowReflectionClient('user_session_123');
-client.sendMessage('What is the billing status for customer 1?');
-```
-
-### React Component
-
-```tsx
-import React, { useState, useEffect, useCallback } from 'react';
-
-interface StreamEvent {
-  type: string;
-  agent_id?: string;
-  content?: string;
-  kind?: string;
-}
-
-const WorkflowReflectionChat: React.FC<{ sessionId: string }> = ({ sessionId }) => {
-  const [messages, setMessages] = useState<Array<{ role: string; content: string }>>([]);
-  const [orchestratorStatus, setOrchestratorStatus] = useState<string>('');
-  const [activeAgents, setActiveAgents] = useState<Set<string>>(new Set());
-  const [ws, setWs] = useState<WebSocket | null>(null);
-  
-  useEffect(() => {
-    const websocket = new WebSocket(`ws://localhost:8000/ws/${sessionId}`);
-    
-    websocket.onmessage = (event) => {
-      const data: StreamEvent = JSON.parse(event.data);
-      handleStreamEvent(data);
-    };
-    
-    setWs(websocket);
-    
-    return () => {
-      websocket.close();
-    };
-  }, [sessionId]);
-  
-  const handleStreamEvent = (event: StreamEvent) => {
-    switch (event.type) {
-      case 'orchestrator':
-        setOrchestratorStatus(event.content || '');
-        break;
-        
-      case 'agent_start':
-        setActiveAgents(prev => new Set(prev).add(event.agent_id!));
-        break;
-        
-      case 'final_result':
-        setMessages(prev => [...prev, { role: 'assistant', content: event.content! }]);
-        setActiveAgents(new Set());
-        break;
-    }
-  };
-  
-  const sendMessage = useCallback((message: string) => {
-    if (ws && ws.readyState === WebSocket.OPEN) {
-      ws.send(JSON.stringify({ message }));
-      setMessages(prev => [...prev, { role: 'user', content: message }]);
-    }
-  }, [ws]);
-  
-  return (
-    <div className="workflow-reflection-chat">
-      <div className="orchestrator-status">
-        {orchestratorStatus && (
-          <div className="status-banner">
-            {orchestratorStatus}
-          </div>
-        )}
-      </div>
-      
-      <div className="active-agents">
-        {Array.from(activeAgents).map(agentId => (
-          <span key={agentId} className="agent-badge">
-            {agentId.replace('_', ' ')}
-          </span>
-        ))}
-      </div>
-      
-      <div className="messages">
-        {messages.map((msg, idx) => (
-          <div key={idx} className={`message ${msg.role}`}>
-            <strong>{msg.role}:</strong>
-            <p>{msg.content}</p>
-          </div>
-        ))}
-      </div>
-      
-      <ChatInput onSend={sendMessage} />
-    </div>
-  );
-};
-```
-
-## Streamlit Integration
-
-```python
-import streamlit as st
-import asyncio
-from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-
-# Initialize session state
-if 'state_store' not in st.session_state:
-    st.session_state.state_store = {}
-if 'session_id' not in st.session_state:
-    st.session_state.session_id = "streamlit_session"
-
-# Create agent
-@st.cache_resource
-def get_agent():
-    return Agent(
-        state_store=st.session_state.state_store,
-        session_id=st.session_state.session_id
-    )
-
-# UI
-st.title("Workflow Reflection Agent Chat")
-
-# Display chat history
-chat_history = st.session_state.state_store.get(
-    f"{st.session_state.session_id}_chat_history", []
-)
-
-for msg in chat_history:
-    with st.chat_message(msg["role"]):
-        st.write(msg["content"])
-
-# Chat input
-if prompt := st.chat_input("Ask me anything..."):
-    # Display user message
-    with st.chat_message("user"):
-        st.write(prompt)
-    
-    # Get agent response
-    agent = get_agent()
-    
-    # Show processing indicator
-    with st.spinner("Processing with workflow reflection..."):
-        response = asyncio.run(agent.chat_async(prompt))
-    
-    # Display assistant response
-    with st.chat_message("assistant"):
-        st.write(response)
-    
-    # Rerun to update chat history
-    st.rerun()
-```
-
-## Configuration Management
-
-### Environment Configuration
-
-Create a `.env` file:
-
-```bash
-# Azure OpenAI Configuration
-AZURE_OPENAI_API_KEY=your_api_key_here
-AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-4
-AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
-AZURE_OPENAI_API_VERSION=2024-02-15-preview
-OPENAI_MODEL_NAME=gpt-4
-
-# Optional: MCP Server
-MCP_SERVER_URI=http://localhost:5000/mcp
-```
-
-### Dynamic Agent Selection
-
-```python
-from typing import Literal
-from agentic_ai.agents.base_agent import BaseAgent
-
-AgentType = Literal["workflow", "traditional"]
-
-def create_agent(
-    agent_type: AgentType,
-    state_store: dict,
-    session_id: str,
-    **kwargs
-) -> BaseAgent:
-    """
-    Factory function to create the appropriate agent type.
-    """
-    if agent_type == "workflow":
-        from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-    elif agent_type == "traditional":
-        from agentic_ai.agents.agent_framework.multi_agent.reflection_agent import Agent
-    else:
-        raise ValueError(f"Unknown agent type: {agent_type}")
-    
-    return Agent(state_store=state_store, session_id=session_id, **kwargs)
-
-# Usage
-agent = create_agent(
-    agent_type="workflow",  # or "traditional"
-    state_store=state_store,
-    session_id=session_id,
-    access_token=access_token
-)
-```
-
-## Monitoring and Logging
-
-### Enhanced Logging
-
-```python
-import logging
-from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-
-# Configure detailed logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler('workflow_agent.log'),
-        logging.StreamHandler()
-    ]
-)
-
-# Create agent
-agent = Agent(state_store=state_store, session_id=session_id)
-
-# Use agent (logs will capture all workflow steps)
-response = await agent.chat_async("Help me")
-```
-
-### Metrics Collection
-
-```python
-import time
-from dataclasses import dataclass
-from typing import List
-
-@dataclass
-class WorkflowMetrics:
-    session_id: str
-    request_id: str
-    start_time: float
-    end_time: float
-    refinement_count: int
-    approved: bool
-    
-    @property
-    def duration(self) -> float:
-        return self.end_time - self.start_time
-
-class MetricsCollector:
-    def __init__(self):
-        self.metrics: List[WorkflowMetrics] = []
-    
-    def track_request(self, session_id: str, request_id: str):
-        # Implementation for tracking metrics
-        pass
-    
-    def report(self):
-        total_requests = len(self.metrics)
-        avg_duration = sum(m.duration for m in self.metrics) / total_requests
-        avg_refinements = sum(m.refinement_count for m in self.metrics) / total_requests
-        
-        print(f"Total Requests: {total_requests}")
-        print(f"Average Duration: {avg_duration:.2f}s")
-        print(f"Average Refinements: {avg_refinements:.2f}")
-
-# Usage with agent
-metrics = MetricsCollector()
-# Integrate with agent workflow
-```
-
-## Testing
-
-### Unit Tests
-
-```python
-import pytest
-from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-
-@pytest.fixture
-def agent():
-    state_store = {}
-    return Agent(state_store=state_store, session_id="test_session")
-
-@pytest.mark.asyncio
-async def test_basic_chat(agent):
-    response = await agent.chat_async("What is 2+2?")
-    assert response is not None
-    assert len(response) > 0
-
-@pytest.mark.asyncio
-async def test_conversation_history(agent):
-    # First message
-    await agent.chat_async("My name is John")
-    
-    # Second message should have context
-    response = await agent.chat_async("What is my name?")
-    assert "john" in response.lower()
-
-@pytest.mark.asyncio
-async def test_mcp_tool_usage(agent):
-    # Assuming MCP is configured
-    response = await agent.chat_async("Get customer details for ID 1")
-    # Verify tool was used and response contains customer data
-    assert "customer" in response.lower()
-```
-
-### Integration Tests
-
-```python
-import pytest
-from fastapi.testclient import TestClient
-from your_backend import app
-
-@pytest.fixture
-def client():
-    return TestClient(app)
-
-def test_chat_endpoint(client):
-    response = client.post(
-        "/chat",
-        json={
-            "session_id": "test_123",
-            "message": "Hello",
-            "use_workflow": True
-        }
-    )
-    assert response.status_code == 200
-    data = response.json()
-    assert data["agent_type"] == "workflow"
-    assert "response" in data
-```
-
-## Best Practices
-
-1. **Session Management**: Use unique session IDs per user
-2. **State Persistence**: Store state in Redis/database for production
-3. **Error Handling**: Implement proper error boundaries
-4. **Rate Limiting**: Protect endpoints from abuse
-5. **Authentication**: Secure MCP endpoints with proper tokens
-6. **Monitoring**: Log all workflow events for debugging
-7. **Testing**: Write comprehensive tests for edge cases
-
-## Troubleshooting
-
-### Issue: Workflow hangs
-
-**Cause**: Missing message handlers or unconnected edges
-
-**Solution**: Verify WorkflowBuilder has all necessary edges:
-```python
-.add_edge(primary_agent, reviewer_agent)
-.add_edge(reviewer_agent, primary_agent)
-```
-
-### Issue: MCP tools not working
-
-**Cause**: MCP_SERVER_URI not set or server not running
-
-**Solution**: 
-```bash
-# Start MCP server
-python mcp/mcp_service.py
-
-# Set environment variable
-export MCP_SERVER_URI=http://localhost:5000/mcp
-```
-
-### Issue: Streaming not working
-
-**Cause**: WebSocket manager not set
-
-**Solution**:
-```python
-agent.set_websocket_manager(ws_manager)
-```
-
-## Migration Checklist
-
-- [ ] Update agent imports
-- [ ] Test basic chat functionality
-- [ ] Verify conversation history persistence
-- [ ] Test streaming with WebSocket
-- [ ] Validate MCP tool integration
-- [ ] Update frontend to handle new event types
-- [ ] Configure monitoring and logging
-- [ ] Run integration tests
-- [ ] Deploy to staging environment
-- [ ] Monitor performance metrics
-
-## Support
-
-For issues or questions:
-1. Check the [README](WORKFLOW_REFLECTION_README.md)
-2. Review [Architecture Diagrams](WORKFLOW_DIAGRAMS.md)
-3. Run tests: `python test_reflection_workflow_agent.py`
-4. Enable debug logging for detailed traces
diff --git a/agentic_ai/agents/agent_framework/multi_agent/PROJECT_SUMMARY.md b/agentic_ai/agents/agent_framework/multi_agent/PROJECT_SUMMARY.md
deleted file mode 100644
index 752e0f928..000000000
--- a/agentic_ai/agents/agent_framework/multi_agent/PROJECT_SUMMARY.md
+++ /dev/null
@@ -1,449 +0,0 @@
-# Workflow-Based Reflection Agent - Project Summary
-
-## What We Created
-
-A complete workflow-based implementation of the reflection agent pattern using Agent Framework's `WorkflowBuilder`, featuring a 3-party communication design with quality assurance gates.
-
-## Files Created
-
-### 1. **reflection_workflow_agent.py** (Main Implementation)
-Location: `agentic_ai/agents/agent_framework/multi_agent/reflection_workflow_agent.py`
-
-**Key Components:**
-- `PrimaryAgentExecutor`: Customer support agent with MCP tool support
-- `ReviewerAgentExecutor`: Quality assurance gate with conditional routing
-- `Agent`: Main class implementing `BaseAgent` interface
-
-**Features:**
-- ✅ 3-party communication pattern (User → Primary → Reviewer → User)
-- ✅ Conversation history management
-- ✅ MCP tool integration
-- ✅ Streaming support via WebSocket
-- ✅ Iterative refinement with feedback loops
-- ✅ Compatible with existing `BaseAgent` interface
-
-### 2. **test_reflection_workflow_agent.py** (Test Suite)
-Location: `agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py`
-
-**Features:**
-- Environment variable validation
-- Basic chat functionality tests
-- MCP tool integration tests
-- Conversation history verification
-- User-friendly output with progress indicators
-
-**Usage:**
-```bash
-python agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py
-```
-
-### 3. **WORKFLOW_REFLECTION_README.md** (Documentation)
-Location: `agentic_ai/agents/agent_framework/multi_agent/WORKFLOW_REFLECTION_README.md`
-
-**Contents:**
-- Architecture overview
-- 3-party communication pattern explanation
-- Implementation details
-- Usage examples
-- Environment configuration
-- Troubleshooting guide
-- Comparison with traditional approach
-- Best practices
-
-### 4. **WORKFLOW_DIAGRAMS.md** (Visual Documentation)
-Location: `agentic_ai/agents/agent_framework/multi_agent/WORKFLOW_DIAGRAMS.md`
-
-**Mermaid Diagrams:**
-- 3-party communication flow
-- Detailed workflow execution sequence
-- Message type relationships
-- Workflow graph structure
-- State management flow
-- Conversation history flow
-- Traditional vs Workflow comparison
-- MCP tool integration
-- Error handling flow
-- Streaming events flow
-
-### 5. **INTEGRATION_GUIDE.md** (Integration Documentation)
-Location: `agentic_ai/agents/agent_framework/multi_agent/INTEGRATION_GUIDE.md`
-
-**Contents:**
-- Quick start guide
-- Backend integration (FastAPI example)
-- Frontend integration (JavaScript/TypeScript, React)
-- Streamlit integration
-- Configuration management
-- Monitoring and logging
-- Testing strategies
-- Migration checklist
-
-## Architecture Highlights
-
-### 3-Party Communication Pattern
-
-```
-User → PrimaryAgent → ReviewerAgent → {approve: User, reject: PrimaryAgent}
-         ↑                                          |
-         |__________________________________________|
-                    (feedback loop)
-```
-
-**Key Principles:**
-1. PrimaryAgent receives user messages but cannot send directly to user
-2. All PrimaryAgent outputs go to ReviewerAgent
-3. ReviewerAgent acts as conditional gate (approve/reject)
-4. Conversation history maintained between User and PrimaryAgent only
-5. Both agents receive history for context
-
-### Workflow Graph
-
-```python
-workflow = (
-    WorkflowBuilder()
-    .add_edge(primary_agent, reviewer_agent)  # Forward path
-    .add_edge(reviewer_agent, primary_agent)  # Feedback path
-    .set_start_executor(primary_agent)
-    .build()
-    .as_agent()
-)
-```
-
-### Message Types
-
-1. **PrimaryAgentRequest**: User → PrimaryAgent
-   - `request_id`: Unique identifier
-   - `user_prompt`: User's question
-   - `conversation_history`: Previous messages
-
-2. **ReviewRequest**: PrimaryAgent → ReviewerAgent
-   - `request_id`: Same as original request
-   - `user_prompt`: Original question
-   - `conversation_history`: For context
-   - `primary_agent_response`: Agent's answer
-
-3. **ReviewResponse**: ReviewerAgent → PrimaryAgent
-   - `request_id`: Correlation ID
-   - `approved`: Boolean decision
-   - `feedback`: Constructive feedback or approval note
-
-## Key Features
-
-### ✅ Workflow-Based Architecture
-- Built using `WorkflowBuilder` for explicit control flow
-- Bidirectional edges between executors
-- Conditional routing based on structured decisions
-
-### ✅ Quality Assurance
-- Every response reviewed before reaching user
-- Structured evaluation criteria:
-  - Accuracy of information
-  - Completeness of answer
-  - Professional tone
-  - Proper tool usage
-  - Clarity and helpfulness
-
-### ✅ Iterative Refinement
-- Failed reviews trigger regeneration with feedback
-- Conversation context preserved across iterations
-- Unlimited refinement cycles until approval
-
-### ✅ MCP Tool Integration
-- Supports MCP tools for external data access
-- Tools available to both agents
-- Proper authentication via bearer tokens
-
-### ✅ Streaming Support
-- WebSocket-based streaming for real-time updates
-- Progress indicators for each workflow stage
-- Token-level streaming for agent responses
-
-### ✅ State Management
-- Conversation history persisted in state store
-- Session-based isolation
-- Compatible with Redis/database for production
-
-## Usage Examples
-
-### Basic Usage
-
-```python
-from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-
-# Create agent
-state_store = {}
-agent = Agent(state_store=state_store, session_id="user_123")
-
-# Chat
-response = await agent.chat_async("Help with customer 1")
-```
-
-### With Streaming
-
-```python
-# Set WebSocket manager
-agent.set_websocket_manager(ws_manager)
-
-# Chat with streaming updates
-response = await agent.chat_async("What promotions are available?")
-```
-
-### With MCP Tools
-
-```python
-# Set MCP_SERVER_URI environment variable
-os.environ["MCP_SERVER_URI"] = "http://localhost:5000/mcp"
-
-# Agent will automatically use MCP tools
-agent = Agent(state_store=state_store, session_id="user_123", access_token=token)
-response = await agent.chat_async("Get billing summary for customer 1")
-```
-
-## Comparison: Workflow vs Traditional
-
-| Feature | Traditional | Workflow |
-|---------|------------|----------|
-| **Architecture** | Sequential agent.run() calls | Message-based graph execution |
-| **Control Flow** | Implicit (procedural code) | Explicit (workflow edges) |
-| **State Management** | Manual (instance variables) | Framework-managed |
-| **Scalability** | Limited | Highly scalable |
-| **Testing** | Mock agent methods | Mock message handlers |
-| **Debugging** | Step through code | Trace message flow |
-| **Extensibility** | Modify agent code | Add executors/edges |
-
-## Integration Points
-
-### Backend Integration
-- ✅ FastAPI example provided
-- ✅ WebSocket support for streaming
-- ✅ Compatible with existing BaseAgent interface
-- ✅ No breaking changes to API
-
-### Frontend Integration
-- ✅ JavaScript/TypeScript client example
-- ✅ React component example
-- ✅ Stream event handlers
-- ✅ Progressive UI updates
-
-### Streamlit Integration
-- ✅ Complete Streamlit example
-- ✅ Session state management
-- ✅ Chat history display
-- ✅ Async execution handling
-
-## Testing
-
-### Run Tests
-
-```bash
-# Basic test
-python agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py
-
-# With specific Python
-python3.11 agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py
-```
-
-### Test Coverage
-- ✅ Environment validation
-- ✅ Basic chat functionality
-- ✅ Conversation history
-- ✅ MCP tool integration
-- ✅ Error handling
-
-## Environment Variables
-
-**Required:**
-- `AZURE_OPENAI_API_KEY`
-- `AZURE_OPENAI_CHAT_DEPLOYMENT`
-- `AZURE_OPENAI_ENDPOINT`
-- `AZURE_OPENAI_API_VERSION`
-- `OPENAI_MODEL_NAME`
-
-**Optional:**
-- `MCP_SERVER_URI` (enables MCP tool usage)
-
-## Documentation Structure
-
-```
-agentic_ai/agents/agent_framework/multi_agent/
-├── reflection_workflow_agent.py          # Main implementation
-├── test_reflection_workflow_agent.py      # Test suite
-├── WORKFLOW_REFLECTION_README.md          # Main documentation
-├── WORKFLOW_DIAGRAMS.md                   # Visual diagrams
-├── INTEGRATION_GUIDE.md                   # Integration examples
-└── PROJECT_SUMMARY.md                     # This file
-```
-
-## Key Learnings from Reference Examples
-
-### From `workflow_as_agent_reflection_pattern_azure.py`
-- ✅ WorkflowBuilder usage patterns
-- ✅ Message-based communication
-- ✅ AgentRunUpdateEvent for output emission
-- ✅ Structured output with Pydantic
-
-### From `workflow_as_agent_human_in_the_loop_azure.py`
-- ✅ RequestInfoExecutor pattern
-- ✅ Correlation with request IDs
-- ✅ Bidirectional edge configuration
-
-### From `edge_condition.py`
-- ✅ Conditional routing with predicates
-- ✅ Boolean edge conditions
-- ✅ Structured decision parsing
-
-### From `guessing_game_with_human_input.py`
-- ✅ Event-driven architecture
-- ✅ RequestResponse correlation
-- ✅ Typed request payloads
-
-## Advantages of Workflow Approach
-
-### 1. **Explicit Control Flow**
-Workflow edges make the communication pattern crystal clear:
-```python
-.add_edge(primary_agent, reviewer_agent)
-.add_edge(reviewer_agent, primary_agent)
-```
-
-### 2. **Better Separation of Concerns**
-Each executor has a single responsibility:
-- PrimaryAgent: Generate responses
-- ReviewerAgent: Evaluate quality
-
-### 3. **Framework-Managed State**
-No need to manually track pending requests across retries.
-
-### 4. **Easier Testing**
-Mock message handlers instead of complex agent interactions.
-
-### 5. **Scalability**
-Easy to add more executors (e.g., specialized reviewers, human escalation).
-
-### 6. **Debugging**
-Message flow is traceable through logs.
-
-## Future Enhancement Ideas
-
-### Short Term
-- [ ] Add max refinement limit to prevent infinite loops
-- [ ] Implement retry logic with exponential backoff
-- [ ] Add metrics collection for performance monitoring
-- [ ] Create Jupyter notebook examples
-
-### Medium Term
-- [ ] Support parallel reviewer agents (consensus-based approval)
-- [ ] Add human-in-the-loop escalation for edge cases
-- [ ] Implement A/B testing framework for review criteria
-- [ ] Create dashboard for workflow analytics
-
-### Long Term
-- [ ] Multi-modal support (images, files)
-- [ ] Fine-tuned reviewer models
-- [ ] Dynamic workflow routing based on request type
-- [ ] Integration with external approval systems
-
-## Migration from Traditional Agent
-
-### Step-by-Step Migration
-
-1. **Update Import**
-   ```python
-   # OLD
-   from agentic_ai.agents.agent_framework.multi_agent.reflection_agent import Agent
-   
-   # NEW
-   from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-   ```
-
-2. **No Code Changes Required**
-   The workflow agent implements the same `BaseAgent` interface.
-
-3. **Test Thoroughly**
-   Run integration tests to verify behavior.
-
-4. **Monitor Performance**
-   Compare response times and quality metrics.
-
-5. **Gradual Rollout**
-   Use feature flags to gradually migrate users.
-
-### Migration Checklist
-
-- [ ] Update agent imports
-- [ ] Test basic chat functionality
-- [ ] Verify conversation history
-- [ ] Test streaming with WebSocket
-- [ ] Validate MCP tool integration
-- [ ] Update frontend event handlers
-- [ ] Configure monitoring
-- [ ] Run integration tests
-- [ ] Deploy to staging
-- [ ] Monitor metrics
-- [ ] Full production rollout
-
-## Success Criteria
-
-### Functional Requirements
-- ✅ All responses reviewed before delivery
-- ✅ Conversation history maintained correctly
-- ✅ MCP tools work as expected
-- ✅ Streaming updates work properly
-- ✅ Compatible with existing interface
-
-### Non-Functional Requirements
-- ✅ Response time < 5 seconds (typical)
-- ✅ Clear logging for debugging
-- ✅ Proper error handling
-- ✅ Comprehensive documentation
-- ✅ Test coverage > 80%
-
-## Resources
-
-### Documentation
-- [Main README](WORKFLOW_REFLECTION_README.md)
-- [Architecture Diagrams](WORKFLOW_DIAGRAMS.md)
-- [Integration Guide](INTEGRATION_GUIDE.md)
-
-### Code
-- [Implementation](reflection_workflow_agent.py)
-- [Tests](test_reflection_workflow_agent.py)
-
-### References
-- [Agent Framework Reflection Example](../../../reference/agent-framework/python/samples/getting_started/workflows/agents/workflow_as_agent_reflection_pattern_azure.py)
-- [Human-in-the-Loop Example](../../../reference/agent-framework/python/samples/getting_started/workflows/agents/workflow_as_agent_human_in_the_loop_azure.py)
-- [Edge Conditions Example](../../../reference/agent-framework/python/samples/getting_started/workflows/control-flow/edge_condition.py)
-
-## Support and Feedback
-
-For issues, questions, or feedback:
-
-1. **Check Documentation**: Review README and integration guide
-2. **Run Tests**: Execute test suite to validate setup
-3. **Enable Debug Logging**: Set log level to DEBUG
-4. **Review Diagrams**: Check architecture diagrams for understanding
-5. **Create Issue**: Document issue with logs and reproduction steps
-
-## Conclusion
-
-The workflow-based reflection agent provides a robust, scalable, and maintainable implementation of the reflection pattern. It leverages Agent Framework's workflow capabilities to create an explicit, testable, and extensible architecture that's ready for production use.
-
-**Key Benefits:**
-- ✅ Explicit 3-party communication pattern
-- ✅ Quality-assured responses
-- ✅ Iterative refinement
-- ✅ Production-ready with streaming
-- ✅ Fully compatible with existing system
-- ✅ Comprehensive documentation
-
-**Ready to Use:**
-- All code tested and documented
-- Integration examples provided
-- Migration path clear
-- Support materials available
-
----
-
-**Version**: 1.0.0  
-**Date**: October 2025  
-**Status**: Production Ready ✅
diff --git a/agentic_ai/agents/agent_framework/multi_agent/QUICK_REFERENCE.md b/agentic_ai/agents/agent_framework/multi_agent/QUICK_REFERENCE.md
deleted file mode 100644
index 2f7e1a7d8..000000000
--- a/agentic_ai/agents/agent_framework/multi_agent/QUICK_REFERENCE.md
+++ /dev/null
@@ -1,351 +0,0 @@
-# Workflow Reflection Agent - Quick Reference
-
-## One-Minute Overview
-
-**What**: Workflow-based reflection agent with 3-party quality assurance pattern  
-**When**: Use for high-quality responses with built-in review process  
-**Why**: Better control flow, scalability, and maintainability vs traditional approach
-
-## Quick Start (30 seconds)
-
-```python
-from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-
-state_store = {}
-agent = Agent(state_store=state_store, session_id="user_123")
-response = await agent.chat_async("Your question here")
-```
-
-## Architecture at a Glance
-
-```
-User ─┬──> PrimaryAgent ─┬──> ReviewerAgent ─┬──> User (if approved)
-      │                   │                    │
-      └─ History ─────────┘                    └──> PrimaryAgent (if rejected)
-                                                           │
-                                                           └──> (loop)
-```
-
-## Key Files
-
-| File | Purpose | Size |
-|------|---------|------|
-| `reflection_workflow_agent.py` | Main implementation | ~600 lines |
-| `test_reflection_workflow_agent.py` | Test suite | ~200 lines |
-| `WORKFLOW_REFLECTION_README.md` | Full documentation | ~400 lines |
-| `WORKFLOW_DIAGRAMS.md` | Visual diagrams | ~500 lines |
-| `INTEGRATION_GUIDE.md` | Integration examples | ~800 lines |
-
-## Message Flow Cheat Sheet
-
-### 1️⃣ User → PrimaryAgent
-```python
-PrimaryAgentRequest(
-    request_id=str(uuid4()),
-    user_prompt="Help me",
-    conversation_history=[...]
-)
-```
-
-### 2️⃣ PrimaryAgent → ReviewerAgent
-```python
-ReviewRequest(
-    request_id=request_id,
-    user_prompt="Help me",
-    conversation_history=[...],
-    primary_agent_response=[ChatMessage(...)]
-)
-```
-
-### 3️⃣ ReviewerAgent Decision
-```python
-ReviewDecision(
-    approved=True/False,
-    feedback="..."
-)
-```
-
-### 4️⃣ Output
-- **If approved**: `AgentRunUpdateEvent` → User
-- **If rejected**: `ReviewResponse` → PrimaryAgent (loop to step 2)
-
-## Common Tasks
-
-### Enable Streaming
-```python
-agent.set_websocket_manager(ws_manager)
-```
-
-### Enable MCP Tools
-```bash
-export MCP_SERVER_URI=http://localhost:5000/mcp
-```
-
-### Access History
-```python
-history = agent.chat_history  # List of dicts
-# or
-history = agent._conversation_history  # List of ChatMessage
-```
-
-### Run Tests
-```bash
-python agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py
-```
-
-## Environment Variables
-
-```bash
-# Required
-AZURE_OPENAI_API_KEY=sk-...
-AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-4
-AZURE_OPENAI_ENDPOINT=https://....openai.azure.com/
-AZURE_OPENAI_API_VERSION=2024-02-15-preview
-OPENAI_MODEL_NAME=gpt-4
-
-# Optional
-MCP_SERVER_URI=http://localhost:5000/mcp
-```
-
-## Streaming Events
-
-| Event Type | When | Purpose |
-|------------|------|---------|
-| `orchestrator` | Start/Progress/End | Workflow status |
-| `agent_start` | Agent begins | Show agent badge |
-| `agent_token` | Token generated | Stream text |
-| `agent_message` | Agent completes | Full message |
-| `tool_called` | Tool invoked | Show tool usage |
-| `final_result` | Workflow done | Final response |
-
-## Debug Checklist
-
-❓ **Not working?**
-1. Check environment variables are set
-2. Verify MCP server is running (if using tools)
-3. Enable debug logging: `logging.basicConfig(level=logging.DEBUG)`
-4. Check WebSocket manager is set (for streaming)
-5. Review logs for error messages
-
-❓ **Infinite loop?**
-1. Check reviewer criteria are achievable
-2. Add max refinement counter
-3. Review feedback content for clarity
-
-❓ **No MCP tools?**
-1. Verify `MCP_SERVER_URI` is set
-2. Test MCP server: `curl $MCP_SERVER_URI/health`
-3. Check access token is valid
-
-## Comparison Matrix
-
-| Feature | Traditional | Workflow | Winner |
-|---------|------------|----------|--------|
-| Control Flow | Implicit | Explicit | 🏆 Workflow |
-| Testability | Medium | High | 🏆 Workflow |
-| Scalability | Limited | High | 🏆 Workflow |
-| Learning Curve | Low | Medium | 🥈 Traditional |
-| State Management | Manual | Auto | 🏆 Workflow |
-| Debugging | Hard | Easy | 🏆 Workflow |
-
-## Code Snippets
-
-### Backend Integration (FastAPI)
-```python
-@app.post("/chat")
-async def chat(session_id: str, message: str):
-    agent = Agent(state_store, session_id)
-    response = await agent.chat_async(message)
-    return {"response": response}
-```
-
-### Frontend Integration (React)
-```tsx
-const [response, setResponse] = useState('');
-ws.onmessage = (event) => {
-  const data = JSON.parse(event.data);
-  if (data.type === 'final_result') {
-    setResponse(data.content);
-  }
-};
-```
-
-### Streamlit Integration
-```python
-agent = Agent(st.session_state.state_store, session_id)
-if prompt := st.chat_input("Ask..."):
-    response = asyncio.run(agent.chat_async(prompt))
-    st.chat_message("assistant").write(response)
-```
-
-## Performance Tips
-
-✅ **DO:**
-- Use streaming for better UX
-- Enable debug logging during development
-- Implement retry logic for MCP tools
-- Cache frequent queries
-- Monitor refinement counts
-
-❌ **DON'T:**
-- Allow unlimited refinement loops
-- Log sensitive customer data
-- Skip error handling
-- Forget to persist state
-- Ignore WebSocket errors
-
-## Workflow Builder Pattern
-
-```python
-workflow = (
-    WorkflowBuilder()
-    .add_edge(executor_a, executor_b)  # A → B
-    .add_edge(executor_b, executor_a)  # B → A (feedback)
-    .set_start_executor(executor_a)     # Start with A
-    .build()                            # Build workflow
-    .as_agent()                         # Expose as agent
-)
-```
-
-## Executor Handlers
-
-```python
-class MyExecutor(Executor):
-    @handler
-    async def handle_message(
-        self, 
-        request: RequestType,
-        ctx: WorkflowContext[ResponseType]
-    ) -> None:
-        # Process request
-        result = await self.process(request)
-        
-        # Send to next executor
-        await ctx.send_message(result)
-        
-        # Or emit to user
-        await ctx.add_event(
-            AgentRunUpdateEvent(
-                self.id, 
-                data=AgentRunResponseUpdate(...)
-            )
-        )
-```
-
-## Structured Output
-
-```python
-from pydantic import BaseModel
-
-class MyResponse(BaseModel):
-    field1: str
-    field2: bool
-
-# Use in chat client
-response = await chat_client.get_response(
-    messages=[...],
-    response_format=MyResponse
-)
-
-# Parse
-parsed = MyResponse.model_validate_json(response.text)
-```
-
-## Logging Best Practices
-
-```python
-import logging
-
-logger = logging.getLogger(__name__)
-
-# In executor
-logger.info(f"[{self.id}] Processing request {request_id[:8]}")
-logger.debug(f"[{self.id}] Full request: {request}")
-logger.error(f"[{self.id}] Error: {e}", exc_info=True)
-```
-
-## Testing Patterns
-
-```python
-@pytest.fixture
-def agent():
-    return Agent(state_store={}, session_id="test")
-
-@pytest.mark.asyncio
-async def test_chat(agent):
-    response = await agent.chat_async("Hello")
-    assert response is not None
-    assert len(response) > 0
-
-@pytest.mark.asyncio
-async def test_history(agent):
-    await agent.chat_async("My name is John")
-    response = await agent.chat_async("What is my name?")
-    assert "john" in response.lower()
-```
-
-## Common Pitfalls
-
-🔴 **Pitfall 1**: Not setting start executor
-```python
-# Wrong
-WorkflowBuilder().add_edge(a, b).build()
-
-# Right
-WorkflowBuilder().add_edge(a, b).set_start_executor(a).build()
-```
-
-🔴 **Pitfall 2**: Missing return edges
-```python
-# Wrong (one-way only)
-.add_edge(primary, reviewer)
-
-# Right (bidirectional for loops)
-.add_edge(primary, reviewer)
-.add_edge(reviewer, primary)
-```
-
-🔴 **Pitfall 3**: Not handling async properly
-```python
-# Wrong
-response = agent.chat_async(prompt)
-
-# Right
-response = await agent.chat_async(prompt)
-# or
-response = asyncio.run(agent.chat_async(prompt))
-```
-
-## Links
-
-📚 **Documentation**
-- [Full README](WORKFLOW_REFLECTION_README.md)
-- [Diagrams](WORKFLOW_DIAGRAMS.md)
-- [Integration Guide](INTEGRATION_GUIDE.md)
-- [Project Summary](PROJECT_SUMMARY.md)
-
-🔧 **Code**
-- [Implementation](reflection_workflow_agent.py)
-- [Tests](test_reflection_workflow_agent.py)
-
-📖 **Examples**
-- Agent Framework Samples in `reference/agent-framework/`
-
-## Support
-
-1. Check docs ↑
-2. Run tests
-3. Enable debug logging
-4. Review error messages
-5. Check environment vars
-
-## Version Info
-
-- **Version**: 1.0.0
-- **Status**: ✅ Production Ready
-- **Python**: 3.10+
-- **Dependencies**: agent-framework, pydantic, azure-identity
-
----
-
-**TIP**: Bookmark this page for quick reference! 📌
diff --git a/agentic_ai/agents/agent_framework/multi_agent/WORKFLOW_DIAGRAMS.md b/agentic_ai/agents/agent_framework/multi_agent/WORKFLOW_DIAGRAMS.md
deleted file mode 100644
index 065a5f1e2..000000000
--- a/agentic_ai/agents/agent_framework/multi_agent/WORKFLOW_DIAGRAMS.md
+++ /dev/null
@@ -1,337 +0,0 @@
-# Workflow-Based Reflection Agent - Architecture Diagrams
-
-## 3-Party Communication Flow
-
-```mermaid
-graph TD
-    User[User] -->|PrimaryAgentRequest| PA[PrimaryAgent Executor]
-    PA -->|ReviewRequest| RA[ReviewerAgent Executor]
-    RA -->|ReviewResponse approved=false| PA
-    RA -->|AgentRunUpdateEvent approved=true| User
-    
-    style User fill:#e1f5ff
-    style PA fill:#fff4e1
-    style RA fill:#e8f5e8
-```
-
-## Detailed Workflow Execution
-
-```mermaid
-sequenceDiagram
-    participant User
-    participant WorkflowAgent
-    participant PrimaryAgent
-    participant ReviewerAgent
-    
-    User->>WorkflowAgent: chat_async("Help with customer 1")
-    WorkflowAgent->>PrimaryAgent: PrimaryAgentRequest<br/>(prompt + history)
-    
-    Note over PrimaryAgent: Generate response<br/>using MCP tools<br/>and conversation history
-    
-    PrimaryAgent->>ReviewerAgent: ReviewRequest<br/>(prompt + history + response)
-    
-    Note over ReviewerAgent: Evaluate response quality<br/>Check accuracy, completeness,<br/>professionalism
-    
-    alt Response Approved
-        ReviewerAgent->>ReviewerAgent: AgentRunUpdateEvent
-        ReviewerAgent->>WorkflowAgent: Emit to user
-        WorkflowAgent->>User: Final response
-    else Response Rejected
-        ReviewerAgent->>PrimaryAgent: ReviewResponse<br/>(approved=false, feedback)
-        
-        Note over PrimaryAgent: Incorporate feedback<br/>Regenerate response
-        
-        PrimaryAgent->>ReviewerAgent: ReviewRequest<br/>(refined response)
-        
-        Note over ReviewerAgent: Re-evaluate
-        
-        ReviewerAgent->>ReviewerAgent: AgentRunUpdateEvent
-        ReviewerAgent->>WorkflowAgent: Emit to user
-        WorkflowAgent->>User: Final response
-    end
-```
-
-## Message Types
-
-```mermaid
-classDiagram
-    class PrimaryAgentRequest {
-        +str request_id
-        +str user_prompt
-        +list~ChatMessage~ conversation_history
-    }
-    
-    class ReviewRequest {
-        +str request_id
-        +str user_prompt
-        +list~ChatMessage~ conversation_history
-        +list~ChatMessage~ primary_agent_response
-    }
-    
-    class ReviewResponse {
-        +str request_id
-        +bool approved
-        +str feedback
-    }
-    
-    class ReviewDecision {
-        +bool approved
-        +str feedback
-    }
-    
-    PrimaryAgentRequest --> ReviewRequest : transforms to
-    ReviewRequest --> ReviewDecision : evaluates into
-    ReviewDecision --> ReviewResponse : converts to
-    ReviewResponse --> PrimaryAgentRequest : triggers retry if rejected
-```
-
-## Workflow Graph Structure
-
-```mermaid
-graph LR
-    Start([Start]) --> PA[PrimaryAgent<br/>Executor]
-    PA -->|ReviewRequest| RA[ReviewerAgent<br/>Executor]
-    RA -->|ReviewResponse<br/>approved=false| PA
-    RA -->|AgentRunUpdateEvent<br/>approved=true| End([User])
-    
-    style Start fill:#90EE90
-    style End fill:#FFB6C1
-    style PA fill:#FFE4B5
-    style RA fill:#E0BBE4
-```
-
-## State Management
-
-```mermaid
-stateDiagram-v2
-    [*] --> UserInput: User sends prompt
-    
-    UserInput --> PrimaryGenerate: Create PrimaryAgentRequest<br/>with conversation history
-    
-    PrimaryGenerate --> ReviewEvaluate: Send ReviewRequest<br/>to ReviewerAgent
-    
-    ReviewEvaluate --> Approved: Quality check passes
-    ReviewEvaluate --> Rejected: Quality check fails
-    
-    Rejected --> PrimaryRefinement: Send ReviewResponse<br/>with feedback
-    
-    PrimaryRefinement --> ReviewEvaluate: Send refined ReviewRequest
-    
-    Approved --> EmitToUser: AgentRunUpdateEvent
-    
-    EmitToUser --> UpdateHistory: Add to conversation history
-    
-    UpdateHistory --> [*]: Return response to user
-    
-    note right of ReviewEvaluate
-        Conditional Gate:
-        - Accuracy
-        - Completeness
-        - Professionalism
-        - Tool usage
-        - Clarity
-    end note
-    
-    note right of PrimaryRefinement
-        Incorporate feedback:
-        - Add reviewer feedback to context
-        - Regenerate response
-        - Maintain conversation history
-    end note
-```
-
-## Conversation History Flow
-
-```mermaid
-graph TB
-    subgraph "State Store"
-        History[Conversation History<br/>User ↔ PrimaryAgent only]
-    end
-    
-    subgraph "Request 1"
-        U1[User: Query 1] --> P1[PrimaryAgent]
-        P1 --> R1[ReviewerAgent]
-        R1 -->|approved| H1[Add to History]
-        H1 --> History
-    end
-    
-    subgraph "Request 2 with History"
-        History --> P2[PrimaryAgent<br/>receives history]
-        U2[User: Query 2] --> P2
-        P2 --> R2[ReviewerAgent<br/>receives history]
-        R2 -->|approved| H2[Add to History]
-        H2 --> History
-    end
-    
-    style History fill:#FFE4E1
-    style H1 fill:#90EE90
-    style H2 fill:#90EE90
-```
-
-## Comparison: Traditional vs Workflow
-
-```mermaid
-graph TB
-    subgraph "Traditional Reflection Agent"
-        T1[Agent.run Step 1:<br/>Primary generates] --> T2[Agent.run Step 2:<br/>Reviewer evaluates]
-        T2 --> T3{Approved?}
-        T3 -->|No| T4[Agent.run Step 3:<br/>Primary refines]
-        T4 --> T2
-        T3 -->|Yes| T5[Return to user]
-        
-        style T1 fill:#FFE4B5
-        style T2 fill:#E0BBE4
-        style T4 fill:#FFE4B5
-    end
-    
-    subgraph "Workflow Reflection Agent"
-        W1[PrimaryAgentExecutor<br/>handles request] --> W2[ReviewerAgentExecutor<br/>evaluates]
-        W2 --> W3{Approved?}
-        W3 -->|No| W4[PrimaryAgentExecutor<br/>handles feedback]
-        W4 --> W2
-        W3 -->|Yes| W5[AgentRunUpdateEvent<br/>to user]
-        
-        style W1 fill:#FFE4B5
-        style W2 fill:#E0BBE4
-        style W4 fill:#FFE4B5
-        style W5 fill:#90EE90
-    end
-```
-
-## MCP Tool Integration
-
-```mermaid
-graph LR
-    subgraph "Workflow"
-        PA[PrimaryAgent] --> RA[ReviewerAgent]
-        RA --> PA
-    end
-    
-    subgraph "MCP Tools"
-        T1[get_customer_detail]
-        T2[get_billing_summary]
-        T3[get_promotions]
-        T4[search_knowledge_base]
-    end
-    
-    PA -.->|Uses tools| T1
-    PA -.->|Uses tools| T2
-    PA -.->|Uses tools| T3
-    PA -.->|Uses tools| T4
-    
-    RA -.->|May use tools<br/>to verify| T1
-    
-    subgraph "MCP Server"
-        MCP[HTTP MCP Server<br/>:5000/mcp]
-    end
-    
-    T1 --> MCP
-    T2 --> MCP
-    T3 --> MCP
-    T4 --> MCP
-    
-    style PA fill:#FFE4B5
-    style RA fill:#E0BBE4
-    style MCP fill:#E1F5FF
-```
-
-## Error Handling Flow
-
-```mermaid
-graph TD
-    Start([User Query]) --> Init[Initialize Workflow]
-    
-    Init --> CheckEnv{Env Config OK?}
-    CheckEnv -->|No| Error1[Raise RuntimeError]
-    CheckEnv -->|Yes| CreateReq[Create PrimaryAgentRequest]
-    
-    CreateReq --> PA[PrimaryAgent Process]
-    
-    PA --> CheckPA{Primary Success?}
-    CheckPA -->|Error| Error2[Log error + Raise]
-    CheckPA -->|Success| RA[ReviewerAgent Process]
-    
-    RA --> CheckRA{Review Success?}
-    CheckRA -->|Error| Error3[Log error + Raise]
-    CheckRA -->|Success| Decision{Approved?}
-    
-    Decision -->|Yes| Success[Return to User]
-    Decision -->|No| CheckRetry{Max Retries?}
-    
-    CheckRetry -->|Exceeded| Error4[Log warning + Return best attempt]
-    CheckRetry -->|Continue| PA
-    
-    style Error1 fill:#FFB6C1
-    style Error2 fill:#FFB6C1
-    style Error3 fill:#FFB6C1
-    style Error4 fill:#FFE4B5
-    style Success fill:#90EE90
-```
-
-## Streaming Events Flow
-
-```mermaid
-sequenceDiagram
-    participant User
-    participant Backend
-    participant WorkflowAgent
-    participant WebSocket
-    
-    User->>Backend: Send query
-    Backend->>WorkflowAgent: chat_async(query)
-    
-    WorkflowAgent->>WebSocket: orchestrator: "plan"<br/>"Workflow starting..."
-    WebSocket->>User: Display plan
-    
-    WorkflowAgent->>WebSocket: agent_start: "primary_agent"
-    WebSocket->>User: Show agent badge
-    
-    loop Primary Generation
-        WorkflowAgent->>WebSocket: agent_token: chunk
-        WebSocket->>User: Stream text
-    end
-    
-    WorkflowAgent->>WebSocket: agent_message: complete
-    WebSocket->>User: Display message
-    
-    WorkflowAgent->>WebSocket: orchestrator: "progress"<br/>"Reviewer evaluating..."
-    WebSocket->>User: Update progress
-    
-    WorkflowAgent->>WebSocket: agent_start: "reviewer_agent"
-    WebSocket->>User: Show reviewer badge
-    
-    loop Reviewer Evaluation
-        WorkflowAgent->>WebSocket: agent_token: chunk
-        WebSocket->>User: Stream text
-    end
-    
-    alt Approved
-        WorkflowAgent->>WebSocket: orchestrator: "result"<br/>"Approved!"
-        WorkflowAgent->>WebSocket: final_result: response
-        WebSocket->>User: Display final response
-    else Rejected
-        WorkflowAgent->>WebSocket: orchestrator: "progress"<br/>"Refining..."
-        Note over WorkflowAgent: Loop back to Primary
-    end
-```
-
----
-
-## How to View These Diagrams
-
-These diagrams use Mermaid syntax, which is supported by:
-
-1. **GitHub**: Automatically rendered in Markdown files
-2. **VS Code**: Install "Markdown Preview Mermaid Support" extension
-3. **Online**: Copy to https://mermaid.live
-4. **Documentation sites**: GitBook, Docusaurus, etc.
-
-## Legend
-
-- 🟢 **Green**: Success/approval states
-- 🟡 **Yellow**: Processing/agent executors
-- 🟣 **Purple**: Review/evaluation
-- 🔵 **Blue**: User/external
-- 🔴 **Red**: Error states
-- ➡️ **Solid arrows**: Direct message flow
-- ⤏ **Dashed arrows**: Tool calls/side effects
diff --git a/agentic_ai/agents/agent_framework/multi_agent/WORKFLOW_REFLECTION_README.md b/agentic_ai/agents/agent_framework/multi_agent/WORKFLOW_REFLECTION_README.md
deleted file mode 100644
index fe91765f2..000000000
--- a/agentic_ai/agents/agent_framework/multi_agent/WORKFLOW_REFLECTION_README.md
+++ /dev/null
@@ -1,345 +0,0 @@
-# Workflow-Based Reflection Agent
-
-A workflow implementation of the reflection pattern using Agent Framework's `WorkflowBuilder`, featuring a 3-party communication design with quality assurance gates.
-
-## Overview
-
-This agent implements a sophisticated reflection pattern where responses are iteratively refined until they meet quality standards. Unlike the traditional two-agent reflection pattern, this uses a workflow-based approach with explicit conditional routing.
-
-## Architecture
-
-### 3-Party Communication Pattern
-
-```
-User → PrimaryAgent → ReviewerAgent → {approve: User, reject: PrimaryAgent}
-         ↑                                          |
-         |__________________________________________|
-                    (feedback loop)
-```
-
-**Key Design Principles:**
-
-1. **PrimaryAgent**: Customer support agent that:
-   - Receives user messages with conversation history
-   - Cannot send messages directly to user
-   - All outputs go to ReviewerAgent for evaluation
-   - Uses MCP tools for data retrieval
-
-2. **ReviewerAgent**: Quality assurance gate that:
-   - Evaluates PrimaryAgent responses
-   - Acts as conditional router:
-     - `approved=true` → Emit to user
-     - `approved=false` → Send feedback to PrimaryAgent
-   - Has access to full conversation context
-
-3. **Conversation History**:
-   - Contains only the User ↔ PrimaryAgent messages
-   - Passed to both agents as context
-   - Updated only when approved responses are delivered
-
-## Features
-
-✅ **Workflow-Based Architecture**
-- Built using `WorkflowBuilder` for explicit control flow
-- Bidirectional edges between PrimaryAgent and ReviewerAgent
-- Conditional routing based on structured review decisions
-
-✅ **Quality Assurance**
-- Every response is reviewed before reaching the user
-- Structured evaluation criteria:
-  - Accuracy of information
-  - Completeness of answer
-  - Professional tone
-  - Proper tool usage
-  - Clarity and helpfulness
-
-✅ **Iterative Refinement**
-- Failed reviews trigger regeneration with feedback
-- Conversation context preserved across iterations
-- Unlimited refinement cycles until approval
-
-✅ **MCP Tool Integration**
-- Supports MCP tools for external data access
-- Tools available to both agents
-- Proper authentication via bearer tokens
-
-✅ **Streaming Support**
-- WebSocket-based streaming for real-time updates
-- Progress indicators for each workflow stage
-- Token-level streaming for agent responses
-
-## Implementation Details
-
-### Executor Classes
-
-#### `PrimaryAgentExecutor`
-```python
-class PrimaryAgentExecutor(Executor):
-    """
-    Generates customer support responses.
-    Sends all outputs to ReviewerAgent.
-    """
-    
-    @handler
-    async def handle_user_request(
-        self, request: PrimaryAgentRequest, ctx: WorkflowContext[ReviewRequest]
-    ) -> None:
-        # Generate response with conversation history
-        # Send to ReviewerAgent for evaluation
-    
-    @handler
-    async def handle_review_feedback(
-        self, review: ReviewResponse, ctx: WorkflowContext[ReviewRequest]
-    ) -> None:
-        # If not approved: incorporate feedback and regenerate
-        # Send refined response back to ReviewerAgent
-```
-
-#### `ReviewerAgentExecutor`
-```python
-class ReviewerAgentExecutor(Executor):
-    """
-    Evaluates responses and acts as conditional gate.
-    """
-    
-    @handler
-    async def review_response(
-        self, request: ReviewRequest, ctx: WorkflowContext[ReviewResponse]
-    ) -> None:
-        # Evaluate response quality
-        # If approved: emit to user via AgentRunUpdateEvent
-        # If not: send feedback to PrimaryAgent
-```
-
-### Message Flow
-
-1. **User Input**
-   ```python
-   PrimaryAgentRequest(
-       request_id=str(uuid4()),
-       user_prompt="What is customer 1's billing status?",
-       conversation_history=[...previous messages...]
-   )
-   ```
-
-2. **Primary Agent → Reviewer**
-   ```python
-   ReviewRequest(
-       request_id=request_id,
-       user_prompt="What is customer 1's billing status?",
-       conversation_history=[...],
-       primary_agent_response=[...ChatMessage...]
-   )
-   ```
-
-3. **Reviewer Decision**
-   ```python
-   ReviewDecision(
-       approved=True/False,
-       feedback="Constructive feedback or approval note"
-   )
-   ```
-
-4. **Conditional Routing**
-   - **Approved**: `AgentRunUpdateEvent` → User
-   - **Rejected**: `ReviewResponse` → PrimaryAgent → Loop back to step 2
-
-### Workflow Graph
-
-```python
-workflow = (
-    WorkflowBuilder()
-    .add_edge(primary_agent, reviewer_agent)  # Forward path
-    .add_edge(reviewer_agent, primary_agent)  # Feedback path
-    .set_start_executor(primary_agent)
-    .build()
-    .as_agent()  # Expose as standard agent interface
-)
-```
-
-## Usage
-
-### Basic Usage
-
-```python
-from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-
-# Create agent instance
-state_store = {}
-session_id = "user_session_123"
-agent = Agent(state_store=state_store, session_id=session_id)
-
-# Process user query
-response = await agent.chat_async("Can you help me with customer ID 1?")
-print(response)
-```
-
-### With Streaming
-
-```python
-# Set WebSocket manager for streaming updates
-agent.set_websocket_manager(ws_manager)
-
-# Chat will now stream progress updates
-response = await agent.chat_async("What promotions are available?")
-```
-
-### With MCP Tools
-
-```python
-# Set MCP_SERVER_URI environment variable
-os.environ["MCP_SERVER_URI"] = "http://localhost:5000/mcp"
-
-# Agent will automatically use MCP tools
-agent = Agent(state_store=state_store, session_id=session_id, access_token=token)
-response = await agent.chat_async("Get billing summary for customer 1")
-```
-
-## Environment Variables
-
-Required:
-- `AZURE_OPENAI_API_KEY`: Azure OpenAI API key
-- `AZURE_OPENAI_CHAT_DEPLOYMENT`: Deployment name
-- `AZURE_OPENAI_ENDPOINT`: Azure OpenAI endpoint URL
-- `AZURE_OPENAI_API_VERSION`: API version (e.g., "2024-02-15-preview")
-- `OPENAI_MODEL_NAME`: Model name (e.g., "gpt-4")
-
-Optional:
-- `MCP_SERVER_URI`: URI for MCP server (enables tool usage)
-
-## Testing
-
-Run the test script:
-
-```bash
-# From project root
-python agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py
-```
-
-The test script will:
-1. Verify environment configuration
-2. Run basic queries
-3. Test MCP tool integration (if configured)
-4. Display conversation history
-
-## Comparison: Workflow vs Traditional
-
-### Traditional Reflection Agent (`reflection_agent.py`)
-- Direct agent-to-agent communication via `run()` calls
-- Sequential execution (Step 1 → Step 2 → Step 3)
-- Implicit control flow
-- Manual state management
-
-### Workflow Reflection Agent (`reflection_workflow_agent.py`)
-- Message-based communication via `WorkflowContext`
-- Graph-based execution (workflow edges)
-- Explicit conditional routing
-- Framework-managed state
-- Better scalability for complex workflows
-
-## Advanced Features
-
-### Custom Review Criteria
-
-Modify the ReviewerAgent's system prompt to enforce custom quality standards:
-
-```python
-# In ReviewerAgentExecutor.__init__
-custom_criteria = """
-Review for:
-1. Response time < 2 seconds
-2. Includes specific customer name
-3. References at least 2 data points
-4. Professional greeting and closing
-"""
-```
-
-### Multiple Refinement Rounds Limit
-
-Add a counter to prevent infinite loops:
-
-```python
-class PrimaryAgentExecutor(Executor):
-    def __init__(self, max_refinements: int = 3):
-        self._max_refinements = max_refinements
-        self._refinement_counts = {}
-    
-    async def handle_review_feedback(self, review, ctx):
-        count = self._refinement_counts.get(review.request_id, 0)
-        if count >= self._max_refinements:
-            # Force approval or escalate
-            return
-```
-
-### Logging and Monitoring
-
-All workflow events are logged with structured information:
-
-```python
-logger.info(f"[PrimaryAgent] Processing request {request_id[:8]}")
-logger.info(f"[ReviewerAgent] Review decision - Approved: {approved}")
-```
-
-Enable debug logging for detailed traces:
-
-```python
-logging.basicConfig(level=logging.DEBUG)
-```
-
-## Best Practices
-
-1. **Conversation History Management**
-   - Keep history concise (last N messages)
-   - Summarize old conversations for long sessions
-
-2. **Error Handling**
-   - Handle MCP tool failures gracefully
-   - Implement retry logic with exponential backoff
-
-3. **Performance**
-   - Use streaming for better user experience
-   - Consider caching for frequent queries
-
-4. **Security**
-   - Always validate MCP tool responses
-   - Sanitize user inputs
-   - Use bearer tokens for authentication
-
-## Troubleshooting
-
-### Common Issues
-
-**Issue**: Agent not using MCP tools
-- **Solution**: Verify `MCP_SERVER_URI` is set and server is running
-
-**Issue**: Infinite refinement loop
-- **Solution**: Check ReviewerAgent criteria are achievable, add max refinement limit
-
-**Issue**: Missing conversation context
-- **Solution**: Ensure history is properly loaded from state_store
-
-**Issue**: Workflow hangs
-- **Solution**: Check for unhandled message types, verify all edges are configured
-
-## Future Enhancements
-
-- [ ] Support for multi-modal inputs (images, files)
-- [ ] Parallel reviewer agents (consensus-based approval)
-- [ ] A/B testing of different review criteria
-- [ ] Metrics and analytics dashboard
-- [ ] Human-in-the-loop escalation for uncertain cases
-- [ ] Fine-tuned reviewer models
-
-## Related Examples
-
-- `reference/agent-framework/python/samples/getting_started/workflows/agents/workflow_as_agent_reflection_pattern_azure.py` - Two-agent reflection
-- `reference/agent-framework/python/samples/getting_started/workflows/agents/workflow_as_agent_human_in_the_loop_azure.py` - Human escalation
-- `reference/agent-framework/python/samples/getting_started/workflows/control-flow/edge_condition.py` - Conditional routing
-
-## License
-
-This code is part of the OpenAI Workshop project. See LICENSE file for details.
-
-## Contributing
-
-Contributions are welcome! Please follow the project's contribution guidelines.
diff --git a/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py b/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py
index 059a3ac6f..dca76b14a 100644
--- a/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py
+++ b/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py
@@ -25,7 +25,7 @@
 from agent_framework import ChatAgent, ChatMessage, Role, MCPStreamableHTTPTool
 from agent_framework.azure import AzureOpenAIChatClient
 
-from agents.base_agent import BaseAgent
+from agents.base_agent import BaseAgent, ToolCallTrackingMixin
 from agents.agent_framework.utils import create_filtered_tool_list
 
 logger = logging.getLogger(__name__)
@@ -158,7 +158,7 @@ class IntentClassification(BaseModel):
 """
 
 
-class Agent(BaseAgent):
+class Agent(ToolCallTrackingMixin, BaseAgent):
     """
     Optimized handoff pattern using vanilla workflow and direct agent communication.
     
@@ -184,6 +184,9 @@ def __init__(self, state_store: Dict[str, Any], session_id: str, access_token: s
         self._turn_key = f"{session_id}_handoff_turn"
         self._current_turn = state_store.get(self._turn_key, 0)
         
+        # Initialize tool tracking from mixin
+        self.init_tool_tracking()
+        
         # Context transfer configuration: -1 = all history, 0 = none, N = last N turns
         self._context_transfer_turns = int(os.getenv("HANDOFF_CONTEXT_TRANSFER_TURNS", "-1"))
         
@@ -510,6 +513,9 @@ async def chat_async(self, prompt: str) -> str:
         """
         await self._setup_agents()
 
+        # Clear tool calls from previous request (from mixin)
+        self.clear_tool_calls()
+
         # Increment turn counter
         self._current_turn += 1
         self.state_store[self._turn_key] = self._current_turn
@@ -589,18 +595,31 @@ async def chat_async(self, prompt: str) -> str:
                 # Process contents in the chunk
                 if hasattr(chunk, 'contents') and chunk.contents:
                     for content in chunk.contents:
-                        # Check for tool/function calls
+                        # Check for tool/function calls - track with arguments
                         if content.type == "function_call":
-                            if self._ws_manager:
-                                await self._ws_manager.broadcast(
-                                    self.session_id,
-                                    {
-                                        "type": "tool_called",
-                                        "agent_id": target_domain,
-                                        "tool_name": content.name,
-                                        "turn": self._current_turn,
-                                    },
-                                )
+                            if content.name:
+                                # New function call - finalize previous and start new
+                                self.track_function_call_start(content.name)
+                                
+                                if self._ws_manager:
+                                    await self._ws_manager.broadcast(
+                                        self.session_id,
+                                        {
+                                            "type": "tool_called",
+                                            "agent_id": target_domain,
+                                            "tool_name": content.name,
+                                            "turn": self._current_turn,
+                                        },
+                                    )
+                            
+                            # Accumulate arguments
+                            args_chunk = getattr(content, 'arguments', '')
+                            if args_chunk:
+                                self.track_function_call_arguments(args_chunk)
+                        
+                        elif content.type == "function_result":
+                            # Function completed - finalize
+                            self.finalize_tool_tracking()
                 
                 # Extract text from chunk
                 if hasattr(chunk, 'text') and chunk.text:
@@ -620,6 +639,9 @@ async def chat_async(self, prompt: str) -> str:
         except Exception as exc:
             logger.error(f"[HANDOFF] Error during agent streaming: {exc}", exc_info=True)
             raise
+        
+        # Finalize any remaining function call
+        self.finalize_tool_tracking()
 
         assistant_response = ''.join(full_response)
 
@@ -682,16 +704,26 @@ async def chat_async(self, prompt: str) -> str:
                         if hasattr(chunk, 'contents') and chunk.contents:
                             for content in chunk.contents:
                                 if content.type == "function_call":
-                                    if self._ws_manager:
-                                        await self._ws_manager.broadcast(
-                                            self.session_id,
-                                            {
-                                                "type": "tool_called",
-                                                "agent_id": new_target_domain,
-                                                "tool_name": content.name,
-                                                "turn": self._current_turn,
-                                            },
-                                        )
+                                    if content.name:
+                                        self.track_function_call_start(content.name)
+                                        
+                                        if self._ws_manager:
+                                            await self._ws_manager.broadcast(
+                                                self.session_id,
+                                                {
+                                                    "type": "tool_called",
+                                                    "agent_id": new_target_domain,
+                                                    "tool_name": content.name,
+                                                    "turn": self._current_turn,
+                                                },
+                                            )
+                                    
+                                    args_chunk = getattr(content, 'arguments', '')
+                                    if args_chunk:
+                                        self.track_function_call_arguments(args_chunk)
+                                
+                                elif content.type == "function_result":
+                                    self.finalize_tool_tracking()
                         
                         if hasattr(chunk, 'text') and chunk.text:
                             full_response_handoff.append(chunk.text)
@@ -709,6 +741,9 @@ async def chat_async(self, prompt: str) -> str:
                     logger.error(f"[HANDOFF] Error during handoff agent streaming: {exc}", exc_info=True)
                     raise
                 
+                # Finalize any remaining function call
+                self.finalize_tool_tracking()
+                
                 # Use handoff response
                 assistant_response = ''.join(full_response_handoff)
                 target_domain = new_target_domain
diff --git a/agentic_ai/agents/agent_framework/multi_agent/magentic_group.py b/agentic_ai/agents/agent_framework/multi_agent/magentic_group.py
index 7460a1eb1..7054bcf2e 100644
--- a/agentic_ai/agents/agent_framework/multi_agent/magentic_group.py
+++ b/agentic_ai/agents/agent_framework/multi_agent/magentic_group.py
@@ -20,7 +20,7 @@
 )
 from agent_framework.azure import AzureOpenAIChatClient  # type: ignore[import]
 
-from agents.base_agent import BaseAgent
+from agents.base_agent import BaseAgent, ToolCallTrackingMixin
 from agents.agent_framework.utils import create_filtered_tool_list
 
 logger = logging.getLogger(__name__)
@@ -104,7 +104,7 @@ def clear_all(self) -> None:
             self._backing.pop("pending_prompt", None)
 
 
-class Agent(BaseAgent):
+class Agent(ToolCallTrackingMixin, BaseAgent):
     """Agent Framework implementation of the collaborative Magentic team."""
 
     DEFAULT_MANAGER_INSTRUCTIONS = (
@@ -226,6 +226,9 @@ def __init__(
         self._stream_agent_id: Optional[str] = None
         self._stream_line_open: bool = False
         self._last_agent_message: Optional[str] = None  # Track last agent message for deduplication
+        
+        # Initialize tool tracking from mixin
+        self.init_tool_tracking()
 
     def set_websocket_manager(self, manager: Any) -> None:
         """Allow backend to inject WebSocket manager for streaming events."""
diff --git a/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py b/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py
index 5a61e43e7..e06b020c8 100644
--- a/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py
+++ b/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py
@@ -13,7 +13,7 @@
 from agent_framework import AgentThread, ChatAgent, MCPStreamableHTTPTool
 from agent_framework.azure import AzureOpenAIChatClient
 
-from agents.base_agent import BaseAgent
+from agents.base_agent import BaseAgent, ToolCallTrackingMixin
 
 logger = logging.getLogger(__name__)
 
@@ -37,7 +37,7 @@
 }
 
 
-class Agent(BaseAgent):
+class Agent(ToolCallTrackingMixin, BaseAgent):
     """Reflection Agent with Primary Agent + Reviewer workflow."""
 
     def __init__(
@@ -55,6 +55,8 @@ def __init__(
         self._access_token = access_token
         self._ws_manager = None
         self._max_refinements = max_refinements
+        # Initialize tool tracking from mixin
+        self.init_tool_tracking()
         logger.info(f"[Reflection] Initialized session: {session_id}")
 
     def set_websocket_manager(self, manager: Any) -> None:
@@ -154,12 +156,48 @@ async def _run_agent(
         prompt: str, 
         agent_id: str,
     ) -> str:
-        """Run an agent with optional streaming."""
+        """Run an agent with optional streaming.
+        
+        Even without WebSocket, we use run_stream to capture tool calls for evaluation.
+        """
         if self._ws_manager:
             return await self._run_agent_streaming(agent, prompt, agent_id)
         else:
-            result = await agent.run(prompt, thread=self._thread)
-            return result.text
+            # Use run_stream even without WebSocket to capture tool calls
+            return await self._run_agent_non_streaming(agent, prompt, agent_id)
+    
+    async def _run_agent_non_streaming(
+        self,
+        agent: ChatAgent,
+        prompt: str,
+        agent_id: str,  # not referenced in this body; presumably kept for parity with _run_agent_streaming
+    ) -> str:
+        """Drain agent.run_stream without broadcasting, recording tool calls; return the joined text."""
+        chunks: List[str] = []
+        
+        async for chunk in agent.run_stream(prompt, thread=self._thread):
+            # Inspect each content item and report function calls/results to the tool-tracking helpers
+            if hasattr(chunk, 'contents') and chunk.contents:
+                for content in chunk.contents:
+                    if content.type == "function_call":
+                        if content.name:  # only items that carry a name open a new tracked call
+                            self.track_function_call_start(content.name)
+                        
+                        args_chunk = getattr(content, 'arguments', '')  # '' when the attribute is absent
+                        if args_chunk:
+                            self.track_function_call_arguments(args_chunk)
+                    
+                    elif content.type == "function_result":
+                        self.finalize_tool_tracking()
+            
+            # Collect text fragments; they are joined into the return value below
+            if hasattr(chunk, 'text') and chunk.text:
+                chunks.append(chunk.text)
+        
+        # Finalize once more after the stream ends, in case no function_result closed the last call
+        self.finalize_tool_tracking()
+        
+        return ''.join(chunks)
 
     async def _run_agent_streaming(
         self, 
@@ -179,15 +217,25 @@ async def _run_agent_streaming(
         chunks: List[str] = []
         
         async for chunk in agent.run_stream(prompt, thread=self._thread):
-            # Handle tool calls
+            # Handle tool calls with argument tracking
             if hasattr(chunk, 'contents') and chunk.contents:
                 for content in chunk.contents:
                     if content.type == "function_call":
-                        await self._broadcast_raw({
-                            "type": "tool_called",
-                            "agent_id": agent_id,
-                            "tool_name": content.name,
-                        })
+                        if content.name:
+                            self.track_function_call_start(content.name)
+                            
+                            await self._broadcast_raw({
+                                "type": "tool_called",
+                                "agent_id": agent_id,
+                                "tool_name": content.name,
+                            })
+                        
+                        args_chunk = getattr(content, 'arguments', '')
+                        if args_chunk:
+                            self.track_function_call_arguments(args_chunk)
+                    
+                    elif content.type == "function_result":
+                        self.finalize_tool_tracking()
             
             # Stream text
             if hasattr(chunk, 'text') and chunk.text:
@@ -198,6 +246,9 @@ async def _run_agent_streaming(
                     "content": chunk.text,
                 })
         
+        # Finalize any remaining function call
+        self.finalize_tool_tracking()
+        
         response = ''.join(chunks)
         
         # Send complete message
@@ -222,6 +273,9 @@ async def chat_async(self, prompt: str) -> str:
         if not self._primary_agent or not self._reviewer or not self._thread:
             raise RuntimeError("Agents not initialized")
 
+        # Clear tool calls from previous request (from mixin)
+        self.clear_tool_calls()
+
         # Notify start
         await self._broadcast("plan", "🔄 Reflection Workflow\n\nStarting Primary Agent → Reviewer pipeline...")
 
diff --git a/agentic_ai/agents/agent_framework/multi_agent/reflection_workflow_agent.py b/agentic_ai/agents/agent_framework/multi_agent/reflection_workflow_agent.py
deleted file mode 100644
index c23d3882e..000000000
--- a/agentic_ai/agents/agent_framework/multi_agent/reflection_workflow_agent.py
+++ /dev/null
@@ -1,645 +0,0 @@
-"""
-Agent Framework Workflow-based Reflection Agent
-
-This implementation uses the WorkflowBuilder pattern with a 3-party communication flow:
-User -> PrimaryAgent -> ReviewerAgent -> User (if approved) OR back to PrimaryAgent (if rejected)
-
-Key Design:
-- PrimaryAgent receives user messages but cannot send directly to user
-- All PrimaryAgent outputs go to ReviewerAgent for evaluation
-- ReviewerAgent acts as a conditional gate: approve or request_for_edit
-- Conversation history is maintained between user and PrimaryAgent only
-- History is passed to both agents for context
-"""
-
-import json
-import logging
-from dataclasses import dataclass
-from typing import Any, Dict, List
-from uuid import uuid4
-
-from agent_framework import (
-    AgentRunResponseUpdate,
-    AgentRunUpdateEvent,
-    ChatMessage,
-    Contents,
-    Executor,
-    MCPStreamableHTTPTool,
-    Role,
-    WorkflowBuilder,
-    WorkflowContext,
-    handler,
-)
-from agent_framework.azure import AzureOpenAIChatClient
-from pydantic import BaseModel
-
-from agents.base_agent import BaseAgent
-
-logger = logging.getLogger(__name__)
-
-
-class ReviewDecision(BaseModel):
-    """Structured output from ReviewerAgent for reliable routing."""
-    approved: bool
-    feedback: str
-
-
-@dataclass
-class PrimaryAgentRequest:
-    """Request sent to PrimaryAgent with conversation history."""
-    request_id: str
-    user_prompt: str
-    conversation_history: list[ChatMessage]
-
-
-@dataclass
-class ReviewRequest:
-    """Request sent from PrimaryAgent to ReviewerAgent."""
-    request_id: str
-    user_prompt: str
-    conversation_history: list[ChatMessage]
-    primary_agent_response: list[ChatMessage]
-
-
-@dataclass
-class ReviewResponse:
-    """Response from ReviewerAgent back to PrimaryAgent."""
-    request_id: str
-    approved: bool
-    feedback: str
-
-
-class PrimaryAgentExecutor(Executor):
-    """
-    Primary Agent - Customer Support Agent with MCP tools.
-    Receives user messages and generates responses sent to ReviewerAgent for approval.
-    """
-
-    def __init__(
-        self,
-        id: str,
-        chat_client: AzureOpenAIChatClient,
-        tools: MCPStreamableHTTPTool | None = None,
-        model: str | None = None,
-        max_refinements: int = 3,
-    ) -> None:
-        super().__init__(id=id)
-        self._chat_client = chat_client
-        self._tools = tools
-        self._model = model
-        self._max_refinements = max_refinements
-        # Track pending requests for retry with feedback
-        self._pending_requests: dict[str, tuple[PrimaryAgentRequest, list[ChatMessage]]] = {}
-        # Track refinement counts to prevent infinite loops
-        self._refinement_counts: dict[str, int] = {}
-
-    @handler
-    async def handle_user_request(
-        self, request: PrimaryAgentRequest, ctx: WorkflowContext[ReviewRequest]
-    ) -> None:
-        """Handle initial user request with conversation history."""
-        print(f"[PrimaryAgent] Processing user request (ID: {request.request_id[:8]})")
-        logger.info(f"[PrimaryAgent] Processing user request (ID: {request.request_id[:8]})")
-
-        # Build message list with system prompt, history, and new user message
-        messages = [
-            ChatMessage(
-                role=Role.SYSTEM,
-                text=(
-                    "You are a helpful customer support assistant for Contoso company. "
-                    "You can help with billing, promotions, security, account information, and other customer inquiries. "
-                    "Use the available MCP tools to look up customer information, billing details, promotions, and security settings. "
-                    "When a customer provides an ID or asks about their account, use the tools to retrieve accurate, up-to-date information. "
-                    "Always be helpful, professional, and provide detailed information when available."
-                ),
-            )
-        ]
-        
-        # Add conversation history for context
-        messages.extend(request.conversation_history)
-        
-        # Add current user prompt
-        messages.append(ChatMessage(role=Role.USER, text=request.user_prompt))
-
-        print(f"[PrimaryAgent] Generating response with {len(messages)} messages in context")
-        logger.info(f"[PrimaryAgent] Generating response with {len(messages)} messages in context")
-
-        # Generate response
-        response = await self._chat_client.get_response(
-            messages=messages,
-            tools=self._tools,
-            model=self._model,
-        )
-
-        print(f"[PrimaryAgent] Response generated: {response.messages[-1].text[:100]}...")
-        logger.info(f"[PrimaryAgent] Response generated")
-
-        # Store full message context for potential retry
-        all_messages = messages + response.messages
-        self._pending_requests[request.request_id] = (request, all_messages)
-        
-        # Initialize refinement counter
-        if request.request_id not in self._refinement_counts:
-            self._refinement_counts[request.request_id] = 0
-
-        # Send to ReviewerAgent for evaluation
-        review_request = ReviewRequest(
-            request_id=request.request_id,
-            user_prompt=request.user_prompt,
-            conversation_history=request.conversation_history,
-            primary_agent_response=response.messages,
-        )
-        
-        print(f"[PrimaryAgent] Sending response to ReviewerAgent for evaluation")
-        logger.info(f"[PrimaryAgent] Sending response to ReviewerAgent for evaluation")
-        await ctx.send_message(review_request)
-
-    @handler
-    async def handle_review_feedback(
-        self, review: ReviewResponse, ctx: WorkflowContext[ReviewRequest]
-    ) -> None:
-        """Handle feedback from ReviewerAgent and regenerate if needed."""
-        print(f"[PrimaryAgent] Received review (ID: {review.request_id[:8]}) - Approved: {review.approved}")
-        logger.info(f"[PrimaryAgent] Received review (ID: {review.request_id[:8]}) - Approved: {review.approved}")
-
-        if review.request_id not in self._pending_requests:
-            logger.error(f"[PrimaryAgent] Unknown request ID: {review.request_id}")
-            raise ValueError(f"Unknown request ID in review: {review.request_id}")
-
-        original_request, messages = self._pending_requests.pop(review.request_id)
-
-        if review.approved:
-            print(f"[PrimaryAgent] Response approved! Sending to user via WorkflowAgent")
-            logger.info(f"[PrimaryAgent] Response approved")
-            
-            # Clean up refinement counter
-            self._refinement_counts.pop(review.request_id, None)
-            
-            # Extract contents from response to emit to user
-            # The WorkflowAgent will handle emitting this to the external consumer
-            # We don't send directly - ReviewerAgent will handle final emission
-            return
-
-        # Check if we've exceeded max refinements
-        current_count = self._refinement_counts.get(review.request_id, 0)
-        if current_count >= self._max_refinements:
-            print(f"[PrimaryAgent] Max refinements ({self._max_refinements}) reached. Force approving response.")
-            logger.warning(f"[PrimaryAgent] Max refinements reached for request {review.request_id[:8]}")
-            
-            # Clean up
-            self._refinement_counts.pop(review.request_id, None)
-            
-            # Force emit the last response even though not approved
-            # The ReviewerAgent already sent the ReviewResponse, so we're done
-            return
-
-        # Increment refinement counter
-        self._refinement_counts[review.request_id] = current_count + 1
-        
-        # Not approved - incorporate feedback and regenerate
-        print(f"[PrimaryAgent] Response not approved (attempt {current_count + 1}/{self._max_refinements}). Feedback: {review.feedback[:100]}...")
-        logger.info(f"[PrimaryAgent] Regenerating with feedback (attempt {current_count + 1}/{self._max_refinements})")
-
-        # Add feedback to message context
-        messages.append(
-            ChatMessage(
-                role=Role.SYSTEM,
-                text=f"REVIEWER FEEDBACK: {review.feedback}\n\nPlease improve your response based on this feedback.",
-            )
-        )
-        
-        # Add the original user prompt again for clarity
-        messages.append(ChatMessage(role=Role.USER, text=original_request.user_prompt))
-
-        # Regenerate response
-        response = await self._chat_client.get_response(
-            messages=messages,
-            tools=self._tools,
-            model=self._model,
-        )
-
-        print(f"[PrimaryAgent] New response generated: {response.messages[-1].text[:100]}...")
-        logger.info(f"[PrimaryAgent] New response generated")
-
-        # Update stored messages
-        messages.extend(response.messages)
-        self._pending_requests[review.request_id] = (original_request, messages)
-
-        # Send updated response for re-review
-        review_request = ReviewRequest(
-            request_id=review.request_id,
-            user_prompt=original_request.user_prompt,
-            conversation_history=original_request.conversation_history,
-            primary_agent_response=response.messages,
-        )
-        
-        print(f"[PrimaryAgent] Sending refined response to ReviewerAgent")
-        logger.info(f"[PrimaryAgent] Sending refined response to ReviewerAgent")
-        await ctx.send_message(review_request)
-
-
-class ReviewerAgentExecutor(Executor):
-    """
-    Reviewer Agent - Quality assurance gate.
-    Evaluates PrimaryAgent responses for accuracy, completeness, and professionalism.
-    Acts as conditional gate: approved responses go to user, rejected go back to PrimaryAgent.
-    """
-
-    def __init__(
-        self,
-        id: str,
-        chat_client: AzureOpenAIChatClient,
-        tools: MCPStreamableHTTPTool | None = None,
-        model: str | None = None,
-    ) -> None:
-        super().__init__(id=id)
-        self._chat_client = chat_client
-        self._tools = tools
-        self._model = model
-
-    @handler
-    async def review_response(
-        self, request: ReviewRequest, ctx: WorkflowContext[ReviewResponse]
-    ) -> None:
-        """
-        Review the PrimaryAgent's response and decide: approve or request edit.
-        Approved responses are emitted to user via AgentRunUpdateEvent.
-        Rejected responses are sent back to PrimaryAgent with feedback.
-        """
-        print(f"[ReviewerAgent] Evaluating response (ID: {request.request_id[:8]})")
-        logger.info(f"[ReviewerAgent] Evaluating response (ID: {request.request_id[:8]})")
-
-        # Build review context with conversation history
-        messages = [
-            ChatMessage(
-                role=Role.SYSTEM,
-                text=(
-                    "You are a quality assurance reviewer for customer support responses. "
-                    "Review the customer support agent's response for:\n"
-                    "1. Accuracy of information\n"
-                    "2. Completeness of answer\n"
-                    "3. Professional tone\n"
-                    "4. Proper use of available tools\n"
-                    "5. Clarity and helpfulness\n\n"
-                    "Be reasonable in your evaluation. If the response is professional, addresses the customer's question, "
-                    "and provides useful information, APPROVE it. Only reject if there are significant issues.\n\n"
-                    "Respond with a structured JSON containing:\n"
-                    "- approved: true if response meets quality standards (be reasonable), false only for major issues\n"
-                    "- feedback: constructive feedback (if not approved) or brief approval note"
-                ),
-            )
-        ]
-
-        # Add conversation history for context
-        messages.extend(request.conversation_history)
-
-        # Add the user's question
-        messages.append(ChatMessage(role=Role.USER, text=request.user_prompt))
-
-        # Add the agent's response
-        messages.extend(request.primary_agent_response)
-
-        # Add explicit review instruction
-        messages.append(
-            ChatMessage(
-                role=Role.USER,
-                text="Please review the agent's response above and provide your assessment.",
-            )
-        )
-
-        print(f"[ReviewerAgent] Sending review request to LLM")
-        logger.info(f"[ReviewerAgent] Sending review request to LLM")
-
-        # Get structured review decision
-        response = await self._chat_client.get_response(
-            messages=messages,
-            response_format=ReviewDecision,
-            tools=self._tools,
-            model=self._model,
-        )
-
-        # Parse decision
-        decision = ReviewDecision.model_validate_json(response.messages[-1].text)
-
-        print(f"[ReviewerAgent] Review decision - Approved: {decision.approved}")
-        if not decision.approved:
-            print(f"[ReviewerAgent] Feedback: {decision.feedback[:100]}...")
-        logger.info(f"[ReviewerAgent] Review decision - Approved: {decision.approved}")
-
-        if decision.approved:
-            # Emit approved response to external consumer (user)
-            print(f"[ReviewerAgent] Emitting approved response to user")
-            logger.info(f"[ReviewerAgent] Emitting approved response to user")
-            
-            contents: list[Contents] = []
-            for message in request.primary_agent_response:
-                contents.extend(message.contents)
-
-            await ctx.add_event(
-                AgentRunUpdateEvent(self.id, data=AgentRunResponseUpdate(contents=contents, role=Role.ASSISTANT))
-            )
-        else:
-            # Send feedback back to PrimaryAgent for refinement
-            print(f"[ReviewerAgent] Sending feedback to PrimaryAgent for refinement")
-            logger.info(f"[ReviewerAgent] Sending feedback to PrimaryAgent for refinement")
-
-        # Always send review response back to enable loop continuation
-        await ctx.send_message(
-            ReviewResponse(
-                request_id=request.request_id,
-                approved=decision.approved,
-                feedback=decision.feedback,
-            )
-        )
-
-
-class Agent(BaseAgent):
-    """
-    Workflow-based Reflection Agent implementation.
-    
-    Implements a 3-party communication pattern:
-    User -> PrimaryAgent -> ReviewerAgent -> User (if approved) OR back to PrimaryAgent (if not)
-    
-    Conversation history is maintained between user and PrimaryAgent only.
-    Both agents receive history for context.
-    """
-
-    def __init__(self, state_store: Dict[str, Any], session_id: str, access_token: str | None = None) -> None:
-        super().__init__(state_store, session_id)
-        self._workflow = None
-        self._initialized = False
-        self._access_token = access_token
-        self._ws_manager = None
-        self._mcp_tool = None  # Store connected MCP tool
-        
-        # Track conversation history as ChatMessage objects
-        self._conversation_history: list[ChatMessage] = []
-        self._load_conversation_history()
-        
-        print(f"WORKFLOW REFLECTION AGENT INITIALIZED - Session: {session_id}")
-        logger.info(f"WORKFLOW REFLECTION AGENT INITIALIZED - Session: {session_id}")
-
-    def _load_conversation_history(self) -> None:
-        """Load conversation history from state store and convert to ChatMessage format."""
-        chat_history = self.chat_history  # From BaseAgent
-        for msg in chat_history:
-            role = Role.USER if msg.get("role") == "user" else Role.ASSISTANT
-            text = msg.get("content", "")
-            self._conversation_history.append(ChatMessage(role=role, text=text))
-        
-        logger.info(f"Loaded {len(self._conversation_history)} messages from history")
-
-    def set_websocket_manager(self, manager: Any) -> None:
-        """Allow backend to inject WebSocket manager for streaming events."""
-        self._ws_manager = manager
-        logger.info(f"[STREAMING] WebSocket manager set for workflow reflection agent, session_id={self.session_id}")
-
-    async def _setup_workflow(self) -> None:
-        """Initialize the workflow with PrimaryAgent and ReviewerAgent executors."""
-        if self._initialized:
-            return
-
-        if not all([self.azure_openai_key, self.azure_deployment, self.azure_openai_endpoint, self.api_version]):
-            raise RuntimeError(
-                "Azure OpenAI configuration is incomplete. Ensure AZURE_OPENAI_API_KEY, "
-                "AZURE_OPENAI_CHAT_DEPLOYMENT, AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_API_VERSION are set."
-            )
-
-        print(f"[WORKFLOW] Setting up workflow agents...")
-        logger.info(f"[WORKFLOW] Setting up workflow agents")
-
-        # Setup MCP tools if configured (create only once)
-        if not self._mcp_tool:
-            headers = self._build_headers()
-            mcp_tools = await self._maybe_create_tools(headers)
-            self._mcp_tool = mcp_tools[0] if mcp_tools else None
-            
-            if self._mcp_tool:
-                print(f"[WORKFLOW] MCP tool created (will connect on first use)")
-                logger.info(f"[WORKFLOW] MCP tool created")
-
-        # Create Azure OpenAI chat client
-        chat_client = AzureOpenAIChatClient(
-            api_key=self.azure_openai_key,
-            deployment_name=self.azure_deployment,
-            endpoint=self.azure_openai_endpoint,
-            api_version=self.api_version,
-        )
-
-        # Create executors
-        primary_agent = PrimaryAgentExecutor(
-            id="primary_agent",
-            chat_client=chat_client,
-            tools=self._mcp_tool,
-            model=self.openai_model_name,
-        )
-
-        reviewer_agent = ReviewerAgentExecutor(
-            id="reviewer_agent",
-            chat_client=chat_client,
-            tools=self._mcp_tool,
-            model=self.openai_model_name,
-        )
-
-        print(f"[WORKFLOW] Building workflow graph: PrimaryAgent <-> ReviewerAgent")
-        logger.info(f"[WORKFLOW] Building workflow graph")
-
-        # Build workflow with bidirectional edges
-        self._workflow = (
-            WorkflowBuilder()
-            .add_edge(primary_agent, reviewer_agent)  # Primary -> Reviewer
-            .add_edge(reviewer_agent, primary_agent)  # Reviewer -> Primary (for feedback)
-            .set_start_executor(primary_agent)
-            .build()
-        )
-
-        self._initialized = True
-        print(f"[WORKFLOW] Workflow initialization complete")
-        logger.info(f"[WORKFLOW] Workflow initialization complete")
-
-    def _build_headers(self) -> Dict[str, str]:
-        """Build HTTP headers for MCP tool requests."""
-        headers = {"Content-Type": "application/json"}
-        if self._access_token:
-            headers["Authorization"] = f"Bearer {self._access_token}"
-        return headers
-
-    async def _maybe_create_tools(self, headers: Dict[str, str]) -> List[MCPStreamableHTTPTool] | None:
-        """Create MCP tools if server URI is configured."""
-        if not self.mcp_server_uri:
-            logger.warning("MCP_SERVER_URI not configured; agents run without MCP tools.")
-            return None
-        
-        print(f"[WORKFLOW] Creating MCP tools with server: {self.mcp_server_uri}")
-        return [
-            MCPStreamableHTTPTool(
-                name="mcp-streamable",
-                url=self.mcp_server_uri,
-                headers=headers,
-                timeout=30,
-                request_timeout=30,
-            )
-        ]
-
-    async def chat_async(self, prompt: str) -> str:
-        """
-        Process user prompt through the reflection workflow.
-        
-        Flow:
-        1. Create PrimaryAgentRequest with conversation history
-        2. PrimaryAgent generates response
-        3. ReviewerAgent evaluates response
-        4. If approved -> return to user
-        5. If not approved -> PrimaryAgent refines with feedback (loop continues)
-        """
-        print(f"WORKFLOW REFLECTION AGENT chat_async called with prompt: {prompt[:50]}...")
-        logger.info(f"WORKFLOW REFLECTION AGENT chat_async called with prompt: {prompt[:50]}...")
-
-        await self._setup_workflow()
-        if not self._workflow:
-            raise RuntimeError("Workflow not initialized correctly.")
-
-        # Create request with conversation history
-        request_id = str(uuid4())
-        request = PrimaryAgentRequest(
-            request_id=request_id,
-            user_prompt=prompt,
-            conversation_history=self._conversation_history.copy(),
-        )
-
-        print(f"[WORKFLOW] Starting workflow execution (Request ID: {request_id[:8]})")
-        logger.info(f"[WORKFLOW] Starting workflow execution")
-
-        # Run workflow (streaming or non-streaming based on ws_manager)
-        if self._ws_manager:
-            print(f"[WORKFLOW] Using STREAMING mode")
-            logger.info(f"[WORKFLOW] Using STREAMING mode")
-            response_text = await self._run_workflow_streaming(request)
-        else:
-            print(f"[WORKFLOW] Using NON-STREAMING mode")
-            logger.info(f"[WORKFLOW] Using NON-STREAMING mode")
-            response_text = await self._run_workflow(request)
-
-        # Update conversation history
-        self._conversation_history.append(ChatMessage(role=Role.USER, text=prompt))
-        self._conversation_history.append(ChatMessage(role=Role.ASSISTANT, text=response_text))
-
-        # Update chat history in base class format
-        messages = [
-            {"role": "user", "content": prompt},
-            {"role": "assistant", "content": response_text},
-        ]
-        self.append_to_chat_history(messages)
-
-        print(f"[WORKFLOW] Workflow execution complete")
-        logger.info(f"[WORKFLOW] Workflow execution complete")
-
-        return response_text
-
-    async def _run_workflow(self, request: PrimaryAgentRequest) -> str:
-        """Run workflow in non-streaming mode."""
-        # Run the workflow directly with the custom request
-        response = await self._workflow.run(request)
-        
-        # Extract text from the workflow result
-        response_text = response.output if hasattr(response, 'output') else str(response)
-        
-        print(f"[WORKFLOW] Response received: {response_text[:100]}...")
-        logger.info(f"[WORKFLOW] Response received")
-        
-        return response_text
-
-    async def _run_workflow_streaming(self, request: PrimaryAgentRequest) -> str:
-        """Run workflow in streaming mode with WebSocket updates."""
-        
-        # Notify UI that workflow is starting
-        if self._ws_manager:
-            await self._ws_manager.broadcast(
-                self.session_id,
-                {
-                    "type": "orchestrator",
-                    "kind": "plan",
-                    "content": "Workflow Reflection Pattern Starting\n\nInitiating PrimaryAgent → ReviewerAgent workflow for quality-assured responses...",
-                },
-            )
-
-        response_text = ""
-        
-        try:
-            async for event in self._workflow.run_stream(request):
-                # Handle different event types
-                event_str = str(event)
-                print(f"[WORKFLOW STREAM] Event: {event_str[:100]}...")
-                
-                # Check if this is an AgentRunUpdateEvent with approved response
-                if isinstance(event, AgentRunUpdateEvent):
-                    print(f"[WORKFLOW STREAM] AgentRunUpdateEvent detected from {event.executor_id}")
-                    
-                    # Extract response from the event data
-                    if hasattr(event, 'data') and isinstance(event.data, AgentRunResponseUpdate):
-                        # Extract text from contents
-                        for content in event.data.contents:
-                            if hasattr(content, 'text') and content.text:
-                                response_text += content.text
-                                
-                                # Stream to WebSocket
-                                if self._ws_manager:
-                                    await self._ws_manager.broadcast(
-                                        self.session_id,
-                                        {
-                                            "type": "agent_token",
-                                            "agent_id": "workflow_reflection",
-                                            "content": content.text,
-                                        },
-                                    )
-                                    
-                        print(f"[WORKFLOW STREAM] Extracted response text: {response_text[:100]}...")
-                
-                # Also check for text attribute directly on event
-                elif hasattr(event, 'text') and event.text:
-                    response_text += event.text
-                    
-                    # Stream to WebSocket
-                    if self._ws_manager:
-                        await self._ws_manager.broadcast(
-                            self.session_id,
-                            {
-                                "type": "agent_token",
-                                "agent_id": "workflow_reflection",
-                                "content": event.text,
-                            },
-                        )
-                
-                # Check for messages attribute
-                elif hasattr(event, 'messages'):
-                    for msg in event.messages:
-                        if hasattr(msg, 'text') and msg.text:
-                            response_text = msg.text
-
-            # Send final result
-            if self._ws_manager and response_text:
-                await self._ws_manager.broadcast(
-                    self.session_id,
-                    {
-                        "type": "final_result",
-                        "content": response_text,
-                    },
-                )
-                
-                await self._ws_manager.broadcast(
-                    self.session_id,
-                    {
-                        "type": "orchestrator",
-                        "kind": "result",
-                        "content": "Workflow Complete\n\nQuality-assured response delivered through PrimaryAgent → ReviewerAgent workflow!",
-                    },
-                )
-
-        except Exception as exc:
-            logger.error(f"[WORKFLOW] Error during streaming: {exc}", exc_info=True)
-            raise
-
-        print(f"[WORKFLOW STREAM] Complete. Response length: {len(response_text)}")
-        logger.info(f"[WORKFLOW STREAM] Complete")
-        
-        return response_text
diff --git a/agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py b/agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py
deleted file mode 100644
index 072bd8985..000000000
--- a/agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py
+++ /dev/null
@@ -1,226 +0,0 @@
-"""
-Test script for the Workflow-based Reflection Agent
-
-This script demonstrates the 3-party communication pattern:
-User -> PrimaryAgent -> ReviewerAgent -> User (if approved) OR back to PrimaryAgent (if not)
-
-Usage:
-    python test_reflection_workflow_agent.py
-"""
-
-import asyncio
-import logging
-import os
-from typing import Dict, Any
-
-# Setup logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-
-logger = logging.getLogger(__name__)
-
-
-async def test_workflow_reflection_agent():
-    """Test the workflow-based reflection agent."""
-    
-    print("=" * 70)
-    print("WORKFLOW REFLECTION AGENT TEST")
-    print("=" * 70)
-    print()
-    
-    # Check environment variables
-    required_env_vars = [
-        "AZURE_OPENAI_API_KEY",
-        "AZURE_OPENAI_CHAT_DEPLOYMENT",
-        "AZURE_OPENAI_ENDPOINT",
-        "AZURE_OPENAI_API_VERSION",
-        "OPENAI_MODEL_NAME",
-    ]
-    
-    print("Checking environment variables...")
-    missing_vars = [var for var in required_env_vars if not os.getenv(var)]
-    if missing_vars:
-        print(f"❌ Missing environment variables: {', '.join(missing_vars)}")
-        print("\nPlease set the following environment variables:")
-        for var in missing_vars:
-            print(f"  - {var}")
-        return
-    
-    print("✓ All required environment variables are set")
-    print()
-    
-    # Optional MCP server
-    mcp_uri = os.getenv("MCP_SERVER_URI")
-    if mcp_uri:
-        print(f"✓ MCP Server configured: {mcp_uri}")
-    else:
-        print("ℹ MCP Server not configured (agents will work without MCP tools)")
-    print()
-    
-    # Import the agent (after env check to avoid import errors)
-    try:
-        from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-    except ImportError as e:
-        print(f"❌ Failed to import Agent: {e}")
-        print("\nMake sure you're running from the project root directory:")
-        print("  python agentic_ai/agents/agent_framework/multi_agent/test_reflection_workflow_agent.py")
-        return
-    
-    # Create state store and agent
-    state_store: Dict[str, Any] = {}
-    session_id = "test_session_001"
-    
-    print(f"Creating Workflow Reflection Agent (Session: {session_id})...")
-    agent = Agent(state_store=state_store, session_id=session_id)
-    print("✓ Agent created successfully")
-    print()
-    
-    # Test queries
-    test_queries = [
-        "What is the capital of France?",
-        "Can you help me with customer ID 1?",
-    ]
-    
-    for i, query in enumerate(test_queries, 1):
-        print("=" * 70)
-        print(f"TEST QUERY {i}: {query}")
-        print("=" * 70)
-        print()
-        
-        try:
-            print(f"Sending query to agent...")
-            print(f"Expected flow: User -> PrimaryAgent -> ReviewerAgent -> (approve/reject)")
-            print()
-            
-            response = await agent.chat_async(query)
-            
-            print()
-            print("-" * 70)
-            print("FINAL RESPONSE:")
-            print("-" * 70)
-            print(response)
-            print()
-            
-            print("✓ Query completed successfully")
-            print()
-            
-        except Exception as e:
-            print(f"❌ Error during query: {e}")
-            logger.error(f"Error during query: {e}", exc_info=True)
-            print()
-    
-    print("=" * 70)
-    print("TEST COMPLETE")
-    print("=" * 70)
-    print()
-    print("Summary:")
-    print(f"- Total queries tested: {len(test_queries)}")
-    print(f"- Session ID: {session_id}")
-    print(f"- Conversation history entries: {len(state_store.get(f'{session_id}_chat_history', []))}")
-    print()
-    print("Key features demonstrated:")
-    print("  ✓ 3-party communication pattern (User -> PrimaryAgent -> ReviewerAgent)")
-    print("  ✓ Conditional gate (approve/reject)")
-    print("  ✓ Conversation history maintenance")
-    print("  ✓ Iterative refinement loop")
-    print()
-
-
-async def test_with_mcp_tools():
-    """Test with actual MCP tools if configured."""
-    
-    print("=" * 70)
-    print("WORKFLOW REFLECTION AGENT TEST WITH MCP TOOLS")
-    print("=" * 70)
-    print()
-    
-    if not os.getenv("MCP_SERVER_URI"):
-        print("⚠ MCP_SERVER_URI not configured. Skipping MCP test.")
-        print("To test with MCP tools, set the MCP_SERVER_URI environment variable.")
-        return
-    
-    # Import the agent
-    try:
-        from agentic_ai.agents.agent_framework.multi_agent.reflection_workflow_agent import Agent
-    except ImportError as e:
-        print(f"❌ Failed to import Agent: {e}")
-        return
-    
-    # Create state store and agent
-    state_store: Dict[str, Any] = {}
-    session_id = "test_session_mcp_001"
-    
-    print(f"Creating Workflow Reflection Agent with MCP tools (Session: {session_id})...")
-    agent = Agent(state_store=state_store, session_id=session_id)
-    print("✓ Agent created successfully")
-    print()
-    
-    # Test MCP-specific queries
-    mcp_queries = [
-        "Can you list all customers?",
-        "What are the billing details for customer ID 1?",
-        "What promotions are available for customer 1?",
-    ]
-    
-    for i, query in enumerate(mcp_queries, 1):
-        print("=" * 70)
-        print(f"MCP TEST QUERY {i}: {query}")
-        print("=" * 70)
-        print()
-        
-        try:
-            print(f"Sending query to agent (expects MCP tool usage)...")
-            print(f"Expected: PrimaryAgent will use MCP tools, ReviewerAgent will verify accuracy")
-            print()
-            
-            response = await agent.chat_async(query)
-            
-            print()
-            print("-" * 70)
-            print("FINAL RESPONSE:")
-            print("-" * 70)
-            print(response)
-            print()
-            
-            print("✓ MCP query completed successfully")
-            print()
-            
-        except Exception as e:
-            print(f"❌ Error during MCP query: {e}")
-            logger.error(f"Error during MCP query: {e}", exc_info=True)
-            print()
-    
-    print("=" * 70)
-    print("MCP TEST COMPLETE")
-    print("=" * 70)
-
-
-def main():
-    """Main entry point."""
-    print()
-    print("╔═══════════════════════════════════════════════════════════════════╗")
-    print("║     WORKFLOW-BASED REFLECTION AGENT TEST SUITE                   ║")
-    print("╚═══════════════════════════════════════════════════════════════════╝")
-    print()
-    
-    # Run basic test
-    asyncio.run(test_workflow_reflection_agent())
-    
-    print()
-    print("-" * 70)
-    print()
-    
-    # Run MCP test if configured
-    asyncio.run(test_with_mcp_tools())
-    
-    print()
-    print("╔═══════════════════════════════════════════════════════════════════╗")
-    print("║     ALL TESTS COMPLETE                                            ║")
-    print("╚═══════════════════════════════════════════════════════════════════╝")
-    print()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/agentic_ai/agents/agent_framework/single_agent.py b/agentic_ai/agents/agent_framework/single_agent.py
index 5b2527031..ae0a4c178 100644
--- a/agentic_ai/agents/agent_framework/single_agent.py
+++ b/agentic_ai/agents/agent_framework/single_agent.py
@@ -1,16 +1,15 @@
-import json
 import logging
 from typing import Any, Dict, List
 
 from agent_framework import AgentThread, ChatAgent, MCPStreamableHTTPTool
 from agent_framework.azure import AzureOpenAIChatClient
 
-from agents.base_agent import BaseAgent
+from agents.base_agent import BaseAgent, ToolCallTrackingMixin
 
 logger = logging.getLogger(__name__)
 
 
-class Agent(BaseAgent):
+class Agent(ToolCallTrackingMixin, BaseAgent):
     """Agent Framework implementation of a single assistant loop."""
 
     def __init__(self, state_store: Dict[str, Any], session_id: str, access_token: str | None = None) -> None:
@@ -23,6 +22,8 @@ def __init__(self, state_store: Dict[str, Any], session_id: str, access_token: s
         # Track conversation turn for tool call grouping - load from state store
         self._turn_key = f"{session_id}_current_turn"
         self._current_turn = state_store.get(self._turn_key, 0)
+        # Initialize tool tracking from mixin
+        self.init_tool_tracking()
 
     def set_websocket_manager(self, manager: Any) -> None:
         """Allow backend to inject WebSocket manager for streaming events."""
@@ -148,12 +149,17 @@ async def _log_mcp_tool_details(self) -> None:
             logger.debug("No tools returned from MCP server during inspection.")
             return
 
+    # get_tool_calls() is inherited from ToolCallTrackingMixin
+
     async def chat_async(self, prompt: str) -> str:
         await self._setup_single_agent()
 
         if not self._agent or not self._thread:
             raise RuntimeError("Agent Framework single agent failed to initialize correctly.")
 
+        # Clear tool calls from previous request (from mixin)
+        self.clear_tool_calls()
+
         # Increment turn counter for this new conversation turn and persist to state store
         self._current_turn += 1
         self.state_store[self._turn_key] = self._current_turn
@@ -162,9 +168,37 @@ async def chat_async(self, prompt: str) -> str:
         if self._ws_manager:
             return await self._chat_async_streaming(prompt)
         
-        # Non-streaming path
-        response = await self._agent.run(prompt, thread=self._thread)
-        assistant_response = response.text
+        # Non-streaming path - use run_stream to capture tool calls
+        full_response = []
+        async for chunk in self._agent.run_stream(prompt, thread=self._thread):
+            # Extract tool calls from contents
+            if hasattr(chunk, 'contents') and chunk.contents:
+                for content in chunk.contents:
+                    if content.type == "function_call":
+                        # Function call chunks come in pieces:
+                        # 1. First chunk has name, empty arguments
+                        # 2. Subsequent chunks have no name, partial arguments
+                        if content.name:
+                            # New function call starting - finalize previous if any
+                            self.track_function_call_start(content.name)
+                        
+                        # Accumulate arguments
+                        args_chunk = getattr(content, 'arguments', '')
+                        if args_chunk:
+                            self.track_function_call_arguments(args_chunk)
+                    
+                    elif content.type == "function_result":
+                        # Function result means the call is complete
+                        self.finalize_tool_tracking()
+            
+            # Extract text
+            if hasattr(chunk, 'text') and chunk.text:
+                full_response.append(chunk.text)
+        
+        # Finalize any remaining function call
+        self.finalize_tool_tracking()
+        
+        assistant_response = ''.join(full_response)
 
         messages = [
             {"role": "user", "content": prompt},
@@ -192,7 +226,7 @@ async def _chat_async_streaming(self, prompt: str) -> str:
                     "show_message_in_internal_process": False,  # Convention: don't show message in left panel
                 },
             )
-
+        
         # Stream the response
         full_response = []
         
@@ -201,18 +235,32 @@ async def _chat_async_streaming(self, prompt: str) -> str:
                 # Process contents in the chunk
                 if hasattr(chunk, 'contents') and chunk.contents:
                     for content in chunk.contents:
-                        # Check for tool/function calls - only broadcast the tool name
+                        # Handle function calls - accumulate arguments across chunks
                         if content.type == "function_call":
-                            if self._ws_manager:
-                                await self._ws_manager.broadcast(
-                                    self.session_id,
-                                    {
-                                        "type": "tool_called",
-                                        "agent_id": "single_agent",
-                                        "tool_name": content.name,
-                                        "turn": self._current_turn,
-                                    },
-                                )
+                            if content.name:
+                                # New function call - finalize previous and start new
+                                self.track_function_call_start(content.name)
+                                
+                                # Broadcast that a tool is being called
+                                if self._ws_manager:
+                                    await self._ws_manager.broadcast(
+                                        self.session_id,
+                                        {
+                                            "type": "tool_called",
+                                            "agent_id": "single_agent",
+                                            "tool_name": content.name,
+                                            "turn": self._current_turn,
+                                        },
+                                    )
+                            
+                            # Accumulate arguments
+                            args_chunk = getattr(content, 'arguments', '')
+                            if args_chunk:
+                                self.track_function_call_arguments(args_chunk)
+                        
+                        elif content.type == "function_result":
+                            # Function completed - finalize
+                            self.finalize_tool_tracking()
                 
                 # Extract text from chunk
                 if hasattr(chunk, 'text') and chunk.text:
@@ -231,6 +279,9 @@ async def _chat_async_streaming(self, prompt: str) -> str:
         except Exception as exc:
             logger.error("[STREAMING] Error during single agent streaming: %s", exc, exc_info=True)
             raise
+        
+        # Finalize any remaining function call
+        self.finalize_tool_tracking()
 
         assistant_response = ''.join(full_response)
 
diff --git a/agentic_ai/agents/base_agent.py b/agentic_ai/agents/base_agent.py
index fb8bbd6f9..7380b4ff1 100644
--- a/agentic_ai/agents/base_agent.py
+++ b/agentic_ai/agents/base_agent.py
@@ -1,4 +1,5 @@
 import os  
+import json
 import logging  
 from typing import Any, Dict, List, Optional, Union  
 from dotenv import load_dotenv  
@@ -7,7 +8,96 @@
 from azure.core.credentials import TokenCredential
 
 load_dotenv()  # Load environment variables from .env file if needed  
-  
+
+
+class ToolCallTrackingMixin:
+    """
+    Mixin that records the tool (function) calls an agent makes per request.
+
+    Argument fragments received while streaming are buffered for the call in
+    progress and parsed when that call is finalized. Every finished call is
+    stored as ``{"name": <tool name>, "args": <dict of arguments>}``.
+
+    Usage:
+        class MyAgent(ToolCallTrackingMixin, BaseAgent):
+            def __init__(self, state_store, session_id):
+                super().__init__(state_store, session_id)
+                self.init_tool_tracking()  # Call this in __init__
+    """
+
+    def init_tool_tracking(self) -> None:
+        """Create empty tracking state. Call this in the agent's __init__."""
+        self._tool_calls: List[Dict[str, Any]] = []
+        self._current_function_call: Dict[str, Any] | None = None
+        self._current_function_args: List[Any] = []
+
+    def clear_tool_calls(self) -> None:
+        """Forget finished and pending calls. Call at the start of chat_async."""
+        self._tool_calls = []
+        self._current_function_call = None
+        self._current_function_args = []
+
+    def get_tool_calls(self) -> List[Dict[str, Any]]:
+        """Return a new list of the recorded ``{"name": str, "args": dict}`` entries.
+
+        Only the list is copied; the entry dicts are the ones held by the tracker.
+        """
+        return self._tool_calls.copy()
+
+    def track_function_call_start(self, name: str) -> None:
+        """Begin tracking the call to tool ``name``, finalizing any pending call first."""
+        self._finalize_current_function_call()
+        self._current_function_call = {"name": name}
+        self._current_function_args = []
+
+    def track_function_call_arguments(self, arguments: str | Dict[str, Any]) -> None:
+        """Buffer one argument fragment for the pending call.
+
+        Args:
+            arguments: A chunk of JSON text, or an already-parsed dict of
+                arguments. Empty values are ignored.
+        """
+        if arguments:
+            self._current_function_args.append(arguments)
+
+    def _finalize_current_function_call(self) -> None:
+        """Record the pending call, if any, with its parsed arguments and reset."""
+        if self._current_function_call is None:
+            return
+
+        args: Dict[str, Any] = {}
+        text_chunks = []
+        for fragment in self._current_function_args:
+            if isinstance(fragment, dict):
+                # Already-parsed arguments: joining them as text would raise TypeError
+                args.update(fragment)
+            else:
+                text_chunks.append(fragment)
+        args_str = ''.join(text_chunks)
+        if args_str.strip():
+            try:
+                parsed = json.loads(args_str)
+            except json.JSONDecodeError:
+                parsed = None
+            if isinstance(parsed, dict):
+                args.update(parsed)
+            else:
+                # Malformed or non-object JSON: keep the raw text so "args" stays a dict
+                args["_raw"] = args_str
+
+        self._tool_calls.append({"name": self._current_function_call["name"], "args": args})
+        self._current_function_call = None
+        self._current_function_args = []
+
+    def finalize_tool_tracking(self) -> None:
+        """Finalize any pending function call. Call at the end of streaming."""
+        self._finalize_current_function_call()
+
+    def add_tool_call(self, name: str, args: Dict[str, Any] | None = None) -> None:
+        """Directly record a finished tool call (for non-streaming scenarios)."""
+        self._tool_calls.append({"name": name, "args": args or {}})
+
+
 class BaseAgent:  
     """  
     Base class for all agents.  
diff --git a/agentic_ai/applications/backend.py b/agentic_ai/applications/backend.py
index 56cf293ef..85cddef08 100644
--- a/agentic_ai/applications/backend.py
+++ b/agentic_ai/applications/backend.py
@@ -286,7 +286,8 @@ class ChatRequest(BaseModel):
   
   
 class ChatResponse(BaseModel):  
-    response: str  
+    response: str
+    tools_used: List[Dict[str, Any]] = []  # Tool calls recorded for the request; each item is {"name": str, "args": dict}  
   
   
 class ConversationHistoryResponse(BaseModel):  
@@ -325,8 +326,16 @@ async def chat(req: ChatRequest, token: str = Depends(verify_token)):
         agent = Agent(STATE_STORE, req.session_id, access_token=token)
     except TypeError:
         agent = Agent(STATE_STORE, req.session_id)
-    answer = await agent.chat_async(req.prompt)  
-    return ChatResponse(response=answer)  
+    answer = await agent.chat_async(req.prompt)
+    
+    # Get tool calls if the agent tracks them
+    tools_used = []
+    if hasattr(agent, 'get_tool_calls'):
+        tools_used = agent.get_tool_calls()
+    elif hasattr(agent, '_tool_calls'):
+        tools_used = agent._tool_calls
+    
+    return ChatResponse(response=answer, tools_used=tools_used)  
   
 @app.post("/reset_session")  
 async def reset_session(req: SessionResetRequest, token: str = Depends(verify_token)):  
diff --git a/agentic_ai/applications/pyproject.toml b/agentic_ai/applications/pyproject.toml
index dad4fd577..c8f6dfa01 100644
--- a/agentic_ai/applications/pyproject.toml
+++ b/agentic_ai/applications/pyproject.toml
@@ -5,9 +5,9 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
-    "agent-framework==1.0.0b260107",
-    "autogen-agentchat==0.7.1",
-    "autogen-ext[mcp]==0.7.1",
+    "agent-framework==1.0.0b260130",
+    "azure-ai-evaluation>=1.14.0",
+    "azure-ai-projects>=2.0.0b2",
     "azure-cosmos==4.9.0",
     "fastapi==0.115.12",
     "flasgger==0.9.7.1",
@@ -26,3 +26,12 @@ dependencies = [
 
 [tool.uv]
 prerelease = "allow"
+
+[dependency-groups]
+dev = [
+    "azure-identity>=1.26.0b1",
+    "azure-keyvault-secrets>=4.10.0",
+    "pytest>=9.0.2",
+    "pytest-asyncio>=1.3.0",
+    "pytest-timeout>=2.4.0",
+]
diff --git a/agentic_ai/applications/uv.lock b/agentic_ai/applications/uv.lock
index 5fff23e52..022275724 100644
--- a/agentic_ai/applications/uv.lock
+++ b/agentic_ai/applications/uv.lock
@@ -2,7 +2,8 @@ version = 1
 revision = 1
 requires-python = ">=3.12"
 resolution-markers = [
-    "python_full_version >= '3.13'",
+    "python_full_version >= '3.14'",
+    "python_full_version == '3.13.*'",
     "python_full_version < '3.13'",
 ]
 
@@ -39,14 +40,14 @@ wheels = [
 
 [[package]]
 name = "agent-framework"
-version = "1.0.0b260107"
+version = "1.0.0b260130"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "agent-framework-core", extra = ["all"] },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/7e/e7/5ad52075da4e586ca94fb8806b3085ac5dea8059413e413bff88c0452e88/agent_framework-1.0.0b260107.tar.gz", hash = "sha256:a2f6508a0ca1df3b7ca4e3a64e45bac8e33cdfe02cf69e9056e37e881a58aad7", size = 2898189 }
+sdist = { url = "https://files.pythonhosted.org/packages/93/10/ba51bf04ea2900897a221664e4e673dcc7a7a58a6658eeb85115e920d9b4/agent_framework-1.0.0b260130.tar.gz", hash = "sha256:50e13b74366b8092cb81769f07b3b42d6ddc8888a51244933c3214df591b7108", size = 3506765 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/8f/55/ffef27526cc26bf163ccf9d58ba87bf4e677bba343a542e7b666846f744d/agent_framework-1.0.0b260107-py3-none-any.whl", hash = "sha256:080deb32bff4ef07227a4ba709798c67079ff8a2997fe7a0aed0010adc0c18cf", size = 5554 },
+    { url = "https://files.pythonhosted.org/packages/bb/3d/2a8efa9085c7fec503a64038f986faf0cdf7f5de853c4ae30724e2e2bda6/agent_framework-1.0.0b260130-py3-none-any.whl", hash = "sha256:b9ba1487f91ab22031e01b5c09e5649181fd717f807d94f22ec43a409c43cde1", size = 5552 },
 ]
 
 [[package]]
@@ -160,7 +161,7 @@ wheels = [
 
 [[package]]
 name = "agent-framework-core"
-version = "1.0.0b260107"
+version = "1.0.0b260130"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "azure-identity" },
@@ -174,9 +175,9 @@ dependencies = [
     { name = "pydantic-settings" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/9d/44/06f5d2c99dd7bdb82c2cb5cbc354b5bc6af72d1886d20eff1dff83508fae/agent_framework_core-1.0.0b260107.tar.gz", hash = "sha256:12636fb64664c6153546f0d85dafccdbe57226767c14b3f38985867389f980bb", size = 3574757 }
+sdist = { url = "https://files.pythonhosted.org/packages/4d/39/e508e778219bd6d20e023a6f48235861a639e3cf888776f9e873bbad3c6b/agent_framework_core-1.0.0b260130.tar.gz", hash = "sha256:030a5b2ced796eec6839c2dabad90b4bd1ea33d1026f3ed1813050a56ccfa4ec", size = 301823 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/1e/5a/8c6315a2ca119ad48340344616d4b8e77fd68e2892f82c402069a52ad647/agent_framework_core-1.0.0b260107-py3-none-any.whl", hash = "sha256:5bd119b8d30dc2d5bee1c4a5c3597d7afc808a52e4de148725c4f2d9bcc7632b", size = 5687298 },
+    { url = "https://files.pythonhosted.org/packages/36/68/afe66c72951a279e0fe048fd5af1e775528cde40dbdab8ec03b42c545df4/agent_framework_core-1.0.0b260130-py3-none-any.whl", hash = "sha256:75b4dd0ca2ae52574d406cf5c9ed7adf63e187379f72fce891743254d83dfd56", size = 348724 },
 ]
 
 [package.optional-dependencies]
@@ -191,6 +192,8 @@ all = [
     { name = "agent-framework-copilotstudio" },
     { name = "agent-framework-declarative" },
     { name = "agent-framework-devui" },
+    { name = "agent-framework-durabletask" },
+    { name = "agent-framework-github-copilot" },
     { name = "agent-framework-lab" },
     { name = "agent-framework-mem0" },
     { name = "agent-framework-ollama" },
@@ -227,6 +230,34 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/53/21/359592eda88e88d8215a5941d120935588bbb7454336514c5353b4ae6240/agent_framework_devui-1.0.0b251114-py3-none-any.whl", hash = "sha256:75657a4b14de5271c587d5ef130d7c031b5936785c3283e16f66b871b1ffa278", size = 338108 },
 ]
 
+[[package]]
+name = "agent-framework-durabletask"
+version = "1.0.0b260130"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "agent-framework-core" },
+    { name = "durabletask" },
+    { name = "durabletask-azuremanaged" },
+    { name = "python-dateutil" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e3/95/9d5ee7fd1fdcd52c10aa1b2902964701d1d62b9d35cc7d05115b90db6329/agent_framework_durabletask-1.0.0b260130.tar.gz", hash = "sha256:63a2c8e0968a51d8e132892e9d385d2b82ccb95263d2c0316dc46b0eaa4dd7a4", size = 30285 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ba/22/122ed515935926137cc3c6ca795ef01b30feb82160cfc0f29a34f9d603de/agent_framework_durabletask-1.0.0b260130-py3-none-any.whl", hash = "sha256:a46e292800d10a62ce0923efe753594ddbf0bd6d1bb6e1258380f0dbf7d0302f", size = 36357 },
+]
+
+[[package]]
+name = "agent-framework-github-copilot"
+version = "1.0.0b260130"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "agent-framework-core" },
+    { name = "github-copilot-sdk" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ae/00/f69d731db02e256b8d18d6d8cd20d3d0684245df876f22b836743403a9c1/agent_framework_github_copilot-1.0.0b260130.tar.gz", hash = "sha256:3f5f231785bc8e663da2d1db65a5e4ee49a0f6266e31cccbf3ef05a79ab6c90d", size = 7929 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/13/b8/0a09396682e915dc25dc39c69fc06cc199b9901ccb0fdbb5e9e2886d2cb0/agent_framework_github_copilot-1.0.0b260130-py3-none-any.whl", hash = "sha256:b8844bacbf666ff1ea7f27d34a42c11be4ade1c4d57e7545341bb74462d82703", size = 8752 },
+]
+
 [[package]]
 name = "agent-framework-lab"
 version = "1.0.0b251024"
@@ -465,6 +496,8 @@ version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
     { name = "agent-framework" },
+    { name = "azure-ai-evaluation" },
+    { name = "azure-ai-projects" },
     { name = "azure-cosmos" },
     { name = "fastapi" },
     { name = "flasgger" },
@@ -481,9 +514,20 @@ dependencies = [
     { name = "websockets" },
 ]
 
+[package.dev-dependencies]
+dev = [
+    { name = "azure-identity" },
+    { name = "azure-keyvault-secrets" },
+    { name = "pytest" },
+    { name = "pytest-asyncio" },
+    { name = "pytest-timeout" },
+]
+
 [package.metadata]
 requires-dist = [
-    { name = "agent-framework", specifier = "==1.0.0b260107" },
+    { name = "agent-framework", specifier = "==1.0.0b260130" },
+    { name = "azure-ai-evaluation", specifier = ">=1.14.0" },
+    { name = "azure-ai-projects", specifier = ">=2.0.0b2" },
     { name = "azure-cosmos", specifier = "==4.9.0" },
     { name = "fastapi", specifier = "==0.115.12" },
     { name = "flasgger", specifier = "==0.9.7.1" },
@@ -500,6 +544,24 @@ requires-dist = [
     { name = "websockets", specifier = ">=15.0.1" },
 ]
 
+[package.metadata.requires-dev]
+dev = [
+    { name = "azure-identity", specifier = ">=1.26.0b1" },
+    { name = "azure-keyvault-secrets", specifier = ">=4.10.0" },
+    { name = "pytest", specifier = ">=9.0.2" },
+    { name = "pytest-asyncio", specifier = ">=1.3.0" },
+    { name = "pytest-timeout", specifier = ">=2.4.0" },
+]
+
+[[package]]
+name = "asyncio"
+version = "4.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/71/ea/26c489a11f7ca862d5705db67683a7361ce11c23a7b98fc6c2deaeccede2/asyncio-4.0.0.tar.gz", hash = "sha256:570cd9e50db83bc1629152d4d0b7558d6451bb1bfd5dfc2e935d96fc2f40329b", size = 5371 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/57/64/eff2564783bd650ca25e15938d1c5b459cda997574a510f7de69688cb0b4/asyncio-4.0.0-py3-none-any.whl", hash = "sha256:c1eddb0659231837046809e68103969b2bef8b0400d59cfa6363f6b5ed8cc88b", size = 5555 },
+]
+
 [[package]]
 name = "attrs"
 version = "25.4.0"
@@ -523,6 +585,29 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6d/6d/15070d23d7a94833a210da09d5d7ed3c24838bb84f0463895e5d159f1695/azure_ai_agents-1.2.0b5-py3-none-any.whl", hash = "sha256:257d0d24a6bf13eed4819cfa5c12fb222e5908deafb3cbfd5711d3a511cc4e88", size = 217948 },
 ]
 
+[[package]]
+name = "azure-ai-evaluation"
+version = "1.14.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohttp" },
+    { name = "azure-core" },
+    { name = "azure-identity" },
+    { name = "azure-storage-blob" },
+    { name = "httpx" },
+    { name = "jinja2" },
+    { name = "msrest" },
+    { name = "nltk" },
+    { name = "openai" },
+    { name = "pandas" },
+    { name = "pyjwt" },
+    { name = "ruamel-yaml" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/ab/62300008df848b210ef2a21b646480eee7c1bf3906afdc1351795343321c/azure_ai_evaluation-1.14.0.tar.gz", hash = "sha256:2a5681805b7cde65ad663f34d0f647d28498dd9395f7e2ce0789320c26664dae", size = 2196726 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/96/68/1e0bd2123a5e681dbe474a3dda098c85704556e53ae24c7f4b3915d4e048/azure_ai_evaluation-1.14.0-py3-none-any.whl", hash = "sha256:1785f9be28517839ab9d30a03893951f7c9b530500d939d0ae51dde3aa1478b0", size = 1141136 },
+]
+
 [[package]]
 name = "azure-ai-projects"
 version = "2.0.0b2"
@@ -619,6 +704,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e1/28/af9ef022f21e3b51b3718d4348f771b490678c1116563895547c0a771362/azure_identity-1.26.0b1-py3-none-any.whl", hash = "sha256:dc608b59ae628a38611208ee761adeb1a2b9390258b58d6edcda2d24c50a4348", size = 197227 },
 ]
 
+[[package]]
+name = "azure-keyvault-secrets"
+version = "4.10.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "azure-core" },
+    { name = "isodate" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/97/e5/3074e581b6e8923c4a1f2e42192ea6f390bb52de3600c68baaaed529ef05/azure_keyvault_secrets-4.10.0.tar.gz", hash = "sha256:666fa42892f9cee749563e551a90f060435ab878977c95265173a8246d546a36", size = 129695 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/26/94/7c902e966b28e7cb5080a8e0dd6bffc22ba44bc907f09c4c633d2b7c4f6a/azure_keyvault_secrets-4.10.0-py3-none-any.whl", hash = "sha256:9dbde256077a4ee1a847646671580692e3f9bea36bcfc189c3cf2b9a94eb38b9", size = 125237 },
+]
+
 [[package]]
 name = "azure-search-documents"
 version = "11.7.0b2"
@@ -816,7 +915,7 @@ name = "clr-loader"
 version = "0.2.10"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "cffi" },
+    { name = "cffi", marker = "python_full_version < '3.14'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/18/24/c12faf3f61614b3131b5c98d3bf0d376b49c7feaa73edca559aeb2aee080/clr_loader-0.2.10.tar.gz", hash = "sha256:81f114afbc5005bafc5efe5af1341d400e22137e275b042a8979f3feb9fc9446", size = 83605 }
 wheels = [
@@ -885,6 +984,34 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896 },
 ]
 
+[[package]]
+name = "durabletask"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "asyncio" },
+    { name = "grpcio" },
+    { name = "packaging" },
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c9/27/3d021e6b36fc1aab6099fafc56dfc8059b4e8968615a26c1a0418601e50a/durabletask-1.3.0.tar.gz", hash = "sha256:11e38dda6df4737fadca0c71fc0a0f769955877c8a8bdb25ccbf90cf45afbf63", size = 57830 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/44/87/31ea460dbfaf50d9877f143e2ce9829cac2fb106747d9900cc353356ea77/durabletask-1.3.0-py3-none-any.whl", hash = "sha256:411f23e13391b8845edca010873dd7a87ee7cfc1fe05753ab28a7cd7c3c1bd77", size = 64112 },
+]
+
+[[package]]
+name = "durabletask-azuremanaged"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "azure-identity" },
+    { name = "durabletask" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/29/29/6bb0b5fe51aa92e117adcdc93efe97cf5476d86c1496e5c5ab35d99a8d07/durabletask_azuremanaged-1.3.0.tar.gz", hash = "sha256:55172588e075afa80d46dcc2e5ddbd84be0a20cc78c74f687040c3720677d34c", size = 4343 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/08/11/4d34fec302c4813e626080f1532d189767eb31d6d80e8f3698c230512f14/durabletask_azuremanaged-1.3.0-py3-none-any.whl", hash = "sha256:9da914f569da1597c858d494a95eda37e4372726c0ee65f30080dcafab262d60", size = 6366 },
+]
+
 [[package]]
 name = "fastapi"
 version = "0.115.12"
@@ -1043,6 +1170,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794 },
 ]
 
+[[package]]
+name = "github-copilot-sdk"
+version = "0.1.20"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pydantic" },
+    { name = "python-dateutil" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/02/7d/afde0ec85815a558612130dc5ff79536299f411e672410c3edc0c1edeb2a/github_copilot_sdk-0.1.20.tar.gz", hash = "sha256:9e89cd46577fd18dd808d7113b7e20e021c4f944121a0a4891945460fb26c53c", size = 92207 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/55/91/f8cfa809184988a273af58824b312d31a532ee3ee70875100b5061540178/github_copilot_sdk-0.1.20-py3-none-any.whl", hash = "sha256:e7fa1bb843e2494930126551b80f3a035f36c47a05f9173ad0cdfb4151ad9346", size = 40306 },
+]
+
 [[package]]
 name = "gitpython"
 version = "3.1.45"
@@ -1321,6 +1462,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656 },
 ]
 
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484 },
+]
+
 [[package]]
 name = "isodate"
 version = "0.7.2"
@@ -1419,6 +1569,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2f/9c/6753e6522b8d0ef07d3a3d239426669e984fb0eba15a315cdbc1253904e4/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24e864cb30ab82311c6425655b0cdab0a98c5d973b065c66a3f020740c2324c", size = 346110 },
 ]
 
+[[package]]
+name = "joblib"
+version = "1.5.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071 },
+]
+
 [[package]]
 name = "jsonpath-ng"
 version = "1.7.0"
@@ -1676,6 +1835,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5e/75/bd9b7bb966668920f06b200e84454c8f3566b102183bc55c5473d96cb2b9/msal_extensions-1.3.1-py3-none-any.whl", hash = "sha256:96d3de4d034504e969ac5e85bae8106c8373b5c6568e4c8fa7af2eca9dbe6bca", size = 20583 },
 ]
 
+[[package]]
+name = "msrest"
+version = "0.7.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "azure-core" },
+    { name = "certifi" },
+    { name = "isodate" },
+    { name = "requests" },
+    { name = "requests-oauthlib" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/68/77/8397c8fb8fc257d8ea0fa66f8068e073278c65f05acb17dcb22a02bfdc42/msrest-0.7.1.zip", hash = "sha256:6e7661f46f3afd88b75667b7187a92829924446c7ea1d169be8c4bb7eeb788b9", size = 175332 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/15/cf/f2966a2638144491f8696c27320d5219f48a072715075d168b31d3237720/msrest-0.7.1-py3-none-any.whl", hash = "sha256:21120a810e1233e5e6cc7fe40b474eeb4ec6f757a15d7cf86702c369f9567c32", size = 85384 },
+]
+
 [[package]]
 name = "multidict"
 version = "6.7.0"
@@ -1784,6 +1959,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c0/a1/4d21933898e23b011ae0528151b57a9230a62960d0919bf2ee48c7f5c20a/narwhals-2.11.0-py3-none-any.whl", hash = "sha256:a9795e1e44aa94e5ba6406ef1c5ee4c172414ced4f1aea4a79e5894f0c7378d4", size = 423069 },
 ]
 
+[[package]]
+name = "nltk"
+version = "3.9.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click" },
+    { name = "joblib" },
+    { name = "regex" },
+    { name = "tqdm" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f9/76/3a5e4312c19a028770f86fd7c058cf9f4ec4321c6cf7526bab998a5b683c/nltk-3.9.2.tar.gz", hash = "sha256:0f409e9b069ca4177c1903c3e843eef90c7e92992fa4931ae607da6de49e1419", size = 2887629 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/60/90/81ac364ef94209c100e12579629dc92bf7a709a84af32f8c551b02c07e94/nltk-3.9.2-py3-none-any.whl", hash = "sha256:1e209d2b3009110635ed9709a67a1a3e33a10f799490fa71cf4bec218c11c88a", size = 1513404 },
+]
+
 [[package]]
 name = "numpy"
 version = "2.3.5"
@@ -1847,6 +2037,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2d/fd/4b5eb0b3e888d86aee4d198c23acec7d214baaf17ea93c1adec94c9518b9/numpy-2.3.5-cp314-cp314t-win_arm64.whl", hash = "sha256:6203fdf9f3dc5bdaed7319ad8698e685c7a3be10819f41d32a0723e611733b42", size = 10545459 },
 ]
 
+[[package]]
+name = "oauthlib"
+version = "3.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0b/5f/19930f824ffeb0ad4372da4812c50edbd1434f678c90c2733e1188edfc63/oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9", size = 185918 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065 },
+]
+
 [[package]]
 name = "ollama"
 version = "0.6.1"
@@ -2096,6 +2295,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835 },
 ]
 
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538 },
+]
+
 [[package]]
 name = "ply"
 version = "3.11"
@@ -2139,8 +2347,8 @@ name = "powerfx"
 version = "0.0.34"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "cffi" },
-    { name = "pythonnet" },
+    { name = "cffi", marker = "python_full_version < '3.14'" },
+    { name = "pythonnet", marker = "python_full_version < '3.14'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/9f/fb/6c4bf87e0c74ca1c563921ce89ca1c5785b7576bca932f7255cdf81082a7/powerfx-0.0.34.tar.gz", hash = "sha256:956992e7afd272657ed16d80f4cad24ec95d9e4a79fb9dfa4a068a09e136af32", size = 3237555 }
 wheels = [
@@ -2414,6 +2622,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ab/4c/b888e6cf58bd9db9c93f40d1c6be8283ff49d88919231afe93a6bcf61626/pydeck-0.9.1-py2.py3-none-any.whl", hash = "sha256:b3f75ba0d273fc917094fa61224f3f6076ca8752b93d46faf3bcfd9f9d59b038", size = 6900403 },
 ]
 
+[[package]]
+name = "pygments"
+version = "2.19.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217 },
+]
+
 [[package]]
 name = "pyjwt"
 version = "2.10.1"
@@ -2428,6 +2645,47 @@ crypto = [
     { name = "cryptography" },
 ]
 
+[[package]]
+name = "pytest"
+version = "9.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801 },
+]
+
+[[package]]
+name = "pytest-asyncio"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075 },
+]
+
+[[package]]
+name = "pytest-timeout"
+version = "2.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382 },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -2472,7 +2730,7 @@ name = "pythonnet"
 version = "3.0.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "clr-loader" },
+    { name = "clr-loader", marker = "python_full_version < '3.14'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/9a/d6/1afd75edd932306ae9bd2c2d961d603dc2b52fcec51b04afea464f1f6646/pythonnet-3.0.5.tar.gz", hash = "sha256:48e43ca463941b3608b32b4e236db92d8d40db4c58a75ace902985f76dac21cf", size = 239212 }
 wheels = [
@@ -2610,6 +2868,94 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0", size = 26775 },
 ]
 
+[[package]]
+name = "regex"
+version = "2026.1.15"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0b/86/07d5056945f9ec4590b518171c4254a5925832eb727b56d3c38a7476f316/regex-2026.1.15.tar.gz", hash = "sha256:164759aa25575cbc0651bef59a0b18353e54300d79ace8084c818ad8ac72b7d5", size = 414811 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/92/81/10d8cf43c807d0326efe874c1b79f22bfb0fb226027b0b19ebc26d301408/regex-2026.1.15-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:4c8fcc5793dde01641a35905d6731ee1548f02b956815f8f1cab89e515a5bdf1", size = 489398 },
+    { url = "https://files.pythonhosted.org/packages/90/b0/7c2a74e74ef2a7c32de724658a69a862880e3e4155cba992ba04d1c70400/regex-2026.1.15-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bfd876041a956e6a90ad7cdb3f6a630c07d491280bfeed4544053cd434901681", size = 291339 },
+    { url = "https://files.pythonhosted.org/packages/19/4d/16d0773d0c818417f4cc20aa0da90064b966d22cd62a8c46765b5bd2d643/regex-2026.1.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9250d087bc92b7d4899ccd5539a1b2334e44eee85d848c4c1aef8e221d3f8c8f", size = 289003 },
+    { url = "https://files.pythonhosted.org/packages/c6/e4/1fc4599450c9f0863d9406e944592d968b8d6dfd0d552a7d569e43bceada/regex-2026.1.15-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8a154cf6537ebbc110e24dabe53095e714245c272da9c1be05734bdad4a61aa", size = 798656 },
+    { url = "https://files.pythonhosted.org/packages/b2/e6/59650d73a73fa8a60b3a590545bfcf1172b4384a7df2e7fe7b9aab4e2da9/regex-2026.1.15-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8050ba2e3ea1d8731a549e83c18d2f0999fbc99a5f6bd06b4c91449f55291804", size = 864252 },
+    { url = "https://files.pythonhosted.org/packages/6e/ab/1d0f4d50a1638849a97d731364c9a80fa304fec46325e48330c170ee8e80/regex-2026.1.15-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf065240704cb8951cc04972cf107063917022511273e0969bdb34fc173456c", size = 912268 },
+    { url = "https://files.pythonhosted.org/packages/dd/df/0d722c030c82faa1d331d1921ee268a4e8fb55ca8b9042c9341c352f17fa/regex-2026.1.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c32bef3e7aeee75746748643667668ef941d28b003bfc89994ecf09a10f7a1b5", size = 803589 },
+    { url = "https://files.pythonhosted.org/packages/66/23/33289beba7ccb8b805c6610a8913d0131f834928afc555b241caabd422a9/regex-2026.1.15-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d5eaa4a4c5b1906bd0d2508d68927f15b81821f85092e06f1a34a4254b0e1af3", size = 775700 },
+    { url = "https://files.pythonhosted.org/packages/e7/65/bf3a42fa6897a0d3afa81acb25c42f4b71c274f698ceabd75523259f6688/regex-2026.1.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:86c1077a3cc60d453d4084d5b9649065f3bf1184e22992bd322e1f081d3117fb", size = 787928 },
+    { url = "https://files.pythonhosted.org/packages/f4/f5/13bf65864fc314f68cdd6d8ca94adcab064d4d39dbd0b10fef29a9da48fc/regex-2026.1.15-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:2b091aefc05c78d286657cd4db95f2e6313375ff65dcf085e42e4c04d9c8d410", size = 858607 },
+    { url = "https://files.pythonhosted.org/packages/a3/31/040e589834d7a439ee43fb0e1e902bc81bd58a5ba81acffe586bb3321d35/regex-2026.1.15-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:57e7d17f59f9ebfa9667e6e5a1c0127b96b87cb9cede8335482451ed00788ba4", size = 763729 },
+    { url = "https://files.pythonhosted.org/packages/9b/84/6921e8129687a427edf25a34a5594b588b6d88f491320b9de5b6339a4fcb/regex-2026.1.15-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:c6c4dcdfff2c08509faa15d36ba7e5ef5fcfab25f1e8f85a0c8f45bc3a30725d", size = 850697 },
+    { url = "https://files.pythonhosted.org/packages/8a/87/3d06143d4b128f4229158f2de5de6c8f2485170c7221e61bf381313314b2/regex-2026.1.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cf8ff04c642716a7f2048713ddc6278c5fd41faa3b9cab12607c7abecd012c22", size = 789849 },
+    { url = "https://files.pythonhosted.org/packages/77/69/c50a63842b6bd48850ebc7ab22d46e7a2a32d824ad6c605b218441814639/regex-2026.1.15-cp312-cp312-win32.whl", hash = "sha256:82345326b1d8d56afbe41d881fdf62f1926d7264b2fc1537f99ae5da9aad7913", size = 266279 },
+    { url = "https://files.pythonhosted.org/packages/f2/36/39d0b29d087e2b11fd8191e15e81cce1b635fcc845297c67f11d0d19274d/regex-2026.1.15-cp312-cp312-win_amd64.whl", hash = "sha256:4def140aa6156bc64ee9912383d4038f3fdd18fee03a6f222abd4de6357ce42a", size = 277166 },
+    { url = "https://files.pythonhosted.org/packages/28/32/5b8e476a12262748851fa8ab1b0be540360692325975b094e594dfebbb52/regex-2026.1.15-cp312-cp312-win_arm64.whl", hash = "sha256:c6c565d9a6e1a8d783c1948937ffc377dd5771e83bd56de8317c450a954d2056", size = 270415 },
+    { url = "https://files.pythonhosted.org/packages/f8/2e/6870bb16e982669b674cce3ee9ff2d1d46ab80528ee6bcc20fb2292efb60/regex-2026.1.15-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e69d0deeb977ffe7ed3d2e4439360089f9c3f217ada608f0f88ebd67afb6385e", size = 489164 },
+    { url = "https://files.pythonhosted.org/packages/dc/67/9774542e203849b0286badf67199970a44ebdb0cc5fb739f06e47ada72f8/regex-2026.1.15-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3601ffb5375de85a16f407854d11cca8fe3f5febbe3ac78fb2866bb220c74d10", size = 291218 },
+    { url = "https://files.pythonhosted.org/packages/b2/87/b0cda79f22b8dee05f774922a214da109f9a4c0eca5da2c9d72d77ea062c/regex-2026.1.15-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4c5ef43b5c2d4114eb8ea424bb8c9cec01d5d17f242af88b2448f5ee81caadbc", size = 288895 },
+    { url = "https://files.pythonhosted.org/packages/3b/6a/0041f0a2170d32be01ab981d6346c83a8934277d82c780d60b127331f264/regex-2026.1.15-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:968c14d4f03e10b2fd960f1d5168c1f0ac969381d3c1fcc973bc45fb06346599", size = 798680 },
+    { url = "https://files.pythonhosted.org/packages/58/de/30e1cfcdbe3e891324aa7568b7c968771f82190df5524fabc1138cb2d45a/regex-2026.1.15-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:56a5595d0f892f214609c9f76b41b7428bed439d98dc961efafdd1354d42baae", size = 864210 },
+    { url = "https://files.pythonhosted.org/packages/64/44/4db2f5c5ca0ccd40ff052ae7b1e9731352fcdad946c2b812285a7505ca75/regex-2026.1.15-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf650f26087363434c4e560011f8e4e738f6f3e029b85d4904c50135b86cfa5", size = 912358 },
+    { url = "https://files.pythonhosted.org/packages/79/b6/e6a5665d43a7c42467138c8a2549be432bad22cbd206f5ec87162de74bd7/regex-2026.1.15-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18388a62989c72ac24de75f1449d0fb0b04dfccd0a1a7c1c43af5eb503d890f6", size = 803583 },
+    { url = "https://files.pythonhosted.org/packages/e7/53/7cd478222169d85d74d7437e74750005e993f52f335f7c04ff7adfda3310/regex-2026.1.15-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6d220a2517f5893f55daac983bfa9fe998a7dbcaee4f5d27a88500f8b7873788", size = 775782 },
+    { url = "https://files.pythonhosted.org/packages/ca/b5/75f9a9ee4b03a7c009fe60500fe550b45df94f0955ca29af16333ef557c5/regex-2026.1.15-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c9c08c2fbc6120e70abff5d7f28ffb4d969e14294fb2143b4b5c7d20e46d1714", size = 787978 },
+    { url = "https://files.pythonhosted.org/packages/72/b3/79821c826245bbe9ccbb54f6eadb7879c722fd3e0248c17bfc90bf54e123/regex-2026.1.15-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7ef7d5d4bd49ec7364315167a4134a015f61e8266c6d446fc116a9ac4456e10d", size = 858550 },
+    { url = "https://files.pythonhosted.org/packages/4a/85/2ab5f77a1c465745bfbfcb3ad63178a58337ae8d5274315e2cc623a822fa/regex-2026.1.15-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:6e42844ad64194fa08d5ccb75fe6a459b9b08e6d7296bd704460168d58a388f3", size = 763747 },
+    { url = "https://files.pythonhosted.org/packages/6d/84/c27df502d4bfe2873a3e3a7cf1bdb2b9cc10284d1a44797cf38bed790470/regex-2026.1.15-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:cfecdaa4b19f9ca534746eb3b55a5195d5c95b88cac32a205e981ec0a22b7d31", size = 850615 },
+    { url = "https://files.pythonhosted.org/packages/7d/b7/658a9782fb253680aa8ecb5ccbb51f69e088ed48142c46d9f0c99b46c575/regex-2026.1.15-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:08df9722d9b87834a3d701f3fca570b2be115654dbfd30179f30ab2f39d606d3", size = 789951 },
+    { url = "https://files.pythonhosted.org/packages/fc/2a/5928af114441e059f15b2f63e188bd00c6529b3051c974ade7444b85fcda/regex-2026.1.15-cp313-cp313-win32.whl", hash = "sha256:d426616dae0967ca225ab12c22274eb816558f2f99ccb4a1d52ca92e8baf180f", size = 266275 },
+    { url = "https://files.pythonhosted.org/packages/4f/16/5bfbb89e435897bff28cf0352a992ca719d9e55ebf8b629203c96b6ce4f7/regex-2026.1.15-cp313-cp313-win_amd64.whl", hash = "sha256:febd38857b09867d3ed3f4f1af7d241c5c50362e25ef43034995b77a50df494e", size = 277145 },
+    { url = "https://files.pythonhosted.org/packages/56/c1/a09ff7392ef4233296e821aec5f78c51be5e91ffde0d163059e50fd75835/regex-2026.1.15-cp313-cp313-win_arm64.whl", hash = "sha256:8e32f7896f83774f91499d239e24cebfadbc07639c1494bb7213983842348337", size = 270411 },
+    { url = "https://files.pythonhosted.org/packages/3c/38/0cfd5a78e5c6db00e6782fdae70458f89850ce95baa5e8694ab91d89744f/regex-2026.1.15-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:ec94c04149b6a7b8120f9f44565722c7ae31b7a6d2275569d2eefa76b83da3be", size = 492068 },
+    { url = "https://files.pythonhosted.org/packages/50/72/6c86acff16cb7c959c4355826bbf06aad670682d07c8f3998d9ef4fee7cd/regex-2026.1.15-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:40c86d8046915bb9aeb15d3f3f15b6fd500b8ea4485b30e1bbc799dab3fe29f8", size = 292756 },
+    { url = "https://files.pythonhosted.org/packages/4e/58/df7fb69eadfe76526ddfce28abdc0af09ffe65f20c2c90932e89d705153f/regex-2026.1.15-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:726ea4e727aba21643205edad8f2187ec682d3305d790f73b7a51c7587b64bdd", size = 291114 },
+    { url = "https://files.pythonhosted.org/packages/ed/6c/a4011cd1cf96b90d2cdc7e156f91efbd26531e822a7fbb82a43c1016678e/regex-2026.1.15-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1cb740d044aff31898804e7bf1181cc72c03d11dfd19932b9911ffc19a79070a", size = 807524 },
+    { url = "https://files.pythonhosted.org/packages/1d/25/a53ffb73183f69c3e9f4355c4922b76d2840aee160af6af5fac229b6201d/regex-2026.1.15-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:05d75a668e9ea16f832390d22131fe1e8acc8389a694c8febc3e340b0f810b93", size = 873455 },
+    { url = "https://files.pythonhosted.org/packages/66/0b/8b47fc2e8f97d9b4a851736f3890a5f786443aa8901061c55f24c955f45b/regex-2026.1.15-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d991483606f3dbec93287b9f35596f41aa2e92b7c2ebbb935b63f409e243c9af", size = 915007 },
+    { url = "https://files.pythonhosted.org/packages/c2/fa/97de0d681e6d26fabe71968dbee06dd52819e9a22fdce5dac7256c31ed84/regex-2026.1.15-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:194312a14819d3e44628a44ed6fea6898fdbecb0550089d84c403475138d0a09", size = 812794 },
+    { url = "https://files.pythonhosted.org/packages/22/38/e752f94e860d429654aa2b1c51880bff8dfe8f084268258adf9151cf1f53/regex-2026.1.15-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe2fda4110a3d0bc163c2e0664be44657431440722c5c5315c65155cab92f9e5", size = 781159 },
+    { url = "https://files.pythonhosted.org/packages/e9/a7/d739ffaef33c378fc888302a018d7f81080393d96c476b058b8c64fd2b0d/regex-2026.1.15-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:124dc36c85d34ef2d9164da41a53c1c8c122cfb1f6e1ec377a1f27ee81deb794", size = 795558 },
+    { url = "https://files.pythonhosted.org/packages/3e/c4/542876f9a0ac576100fc73e9c75b779f5c31e3527576cfc9cb3009dcc58a/regex-2026.1.15-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:a1774cd1981cd212506a23a14dba7fdeaee259f5deba2df6229966d9911e767a", size = 868427 },
+    { url = "https://files.pythonhosted.org/packages/fc/0f/d5655bea5b22069e32ae85a947aa564912f23758e112cdb74212848a1a1b/regex-2026.1.15-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:b5f7d8d2867152cdb625e72a530d2ccb48a3d199159144cbdd63870882fb6f80", size = 769939 },
+    { url = "https://files.pythonhosted.org/packages/20/06/7e18a4fa9d326daeda46d471a44ef94201c46eaa26dbbb780b5d92cbfdda/regex-2026.1.15-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:492534a0ab925d1db998defc3c302dae3616a2fc3fe2e08db1472348f096ddf2", size = 854753 },
+    { url = "https://files.pythonhosted.org/packages/3b/67/dc8946ef3965e166f558ef3b47f492bc364e96a265eb4a2bb3ca765c8e46/regex-2026.1.15-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c661fc820cfb33e166bf2450d3dadbda47c8d8981898adb9b6fe24e5e582ba60", size = 799559 },
+    { url = "https://files.pythonhosted.org/packages/a5/61/1bba81ff6d50c86c65d9fd84ce9699dd106438ee4cdb105bf60374ee8412/regex-2026.1.15-cp313-cp313t-win32.whl", hash = "sha256:99ad739c3686085e614bf77a508e26954ff1b8f14da0e3765ff7abbf7799f952", size = 268879 },
+    { url = "https://files.pythonhosted.org/packages/e9/5e/cef7d4c5fb0ea3ac5c775fd37db5747f7378b29526cc83f572198924ff47/regex-2026.1.15-cp313-cp313t-win_amd64.whl", hash = "sha256:32655d17905e7ff8ba5c764c43cb124e34a9245e45b83c22e81041e1071aee10", size = 280317 },
+    { url = "https://files.pythonhosted.org/packages/b4/52/4317f7a5988544e34ab57b4bde0f04944c4786128c933fb09825924d3e82/regex-2026.1.15-cp313-cp313t-win_arm64.whl", hash = "sha256:b2a13dd6a95e95a489ca242319d18fc02e07ceb28fa9ad146385194d95b3c829", size = 271551 },
+    { url = "https://files.pythonhosted.org/packages/52/0a/47fa888ec7cbbc7d62c5f2a6a888878e76169170ead271a35239edd8f0e8/regex-2026.1.15-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:d920392a6b1f353f4aa54328c867fec3320fa50657e25f64abf17af054fc97ac", size = 489170 },
+    { url = "https://files.pythonhosted.org/packages/ac/c4/d000e9b7296c15737c9301708e9e7fbdea009f8e93541b6b43bdb8219646/regex-2026.1.15-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b5a28980a926fa810dbbed059547b02783952e2efd9c636412345232ddb87ff6", size = 291146 },
+    { url = "https://files.pythonhosted.org/packages/f9/b6/921cc61982e538682bdf3bdf5b2c6ab6b34368da1f8e98a6c1ddc503c9cf/regex-2026.1.15-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:621f73a07595d83f28952d7bd1e91e9d1ed7625fb7af0064d3516674ec93a2a2", size = 288986 },
+    { url = "https://files.pythonhosted.org/packages/ca/33/eb7383dde0bbc93f4fb9d03453aab97e18ad4024ac7e26cef8d1f0a2cff0/regex-2026.1.15-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d7d92495f47567a9b1669c51fc8d6d809821849063d168121ef801bbc213846", size = 799098 },
+    { url = "https://files.pythonhosted.org/packages/27/56/b664dccae898fc8d8b4c23accd853f723bde0f026c747b6f6262b688029c/regex-2026.1.15-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8dd16fba2758db7a3780a051f245539c4451ca20910f5a5e6ea1c08d06d4a76b", size = 864980 },
+    { url = "https://files.pythonhosted.org/packages/16/40/0999e064a170eddd237bae9ccfcd8f28b3aa98a38bf727a086425542a4fc/regex-2026.1.15-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1e1808471fbe44c1a63e5f577a1d5f02fe5d66031dcbdf12f093ffc1305a858e", size = 911607 },
+    { url = "https://files.pythonhosted.org/packages/07/78/c77f644b68ab054e5a674fb4da40ff7bffb2c88df58afa82dbf86573092d/regex-2026.1.15-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0751a26ad39d4f2ade8fe16c59b2bf5cb19eb3d2cd543e709e583d559bd9efde", size = 803358 },
+    { url = "https://files.pythonhosted.org/packages/27/31/d4292ea8566eaa551fafc07797961c5963cf5235c797cc2ae19b85dfd04d/regex-2026.1.15-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0f0c7684c7f9ca241344ff95a1de964f257a5251968484270e91c25a755532c5", size = 775833 },
+    { url = "https://files.pythonhosted.org/packages/ce/b2/cff3bf2fea4133aa6fb0d1e370b37544d18c8350a2fa118c7e11d1db0e14/regex-2026.1.15-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:74f45d170a21df41508cb67165456538425185baaf686281fa210d7e729abc34", size = 788045 },
+    { url = "https://files.pythonhosted.org/packages/8d/99/2cb9b69045372ec877b6f5124bda4eb4253bc58b8fe5848c973f752bc52c/regex-2026.1.15-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:f1862739a1ffb50615c0fde6bae6569b5efbe08d98e59ce009f68a336f64da75", size = 859374 },
+    { url = "https://files.pythonhosted.org/packages/09/16/710b0a5abe8e077b1729a562d2f297224ad079f3a66dce46844c193416c8/regex-2026.1.15-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:453078802f1b9e2b7303fb79222c054cb18e76f7bdc220f7530fdc85d319f99e", size = 763940 },
+    { url = "https://files.pythonhosted.org/packages/dd/d1/7585c8e744e40eb3d32f119191969b91de04c073fca98ec14299041f6e7e/regex-2026.1.15-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:a30a68e89e5a218b8b23a52292924c1f4b245cb0c68d1cce9aec9bbda6e2c160", size = 850112 },
+    { url = "https://files.pythonhosted.org/packages/af/d6/43e1dd85df86c49a347aa57c1f69d12c652c7b60e37ec162e3096194a278/regex-2026.1.15-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:9479cae874c81bf610d72b85bb681a94c95722c127b55445285fb0e2c82db8e1", size = 789586 },
+    { url = "https://files.pythonhosted.org/packages/93/38/77142422f631e013f316aaae83234c629555729a9fbc952b8a63ac91462a/regex-2026.1.15-cp314-cp314-win32.whl", hash = "sha256:d639a750223132afbfb8f429c60d9d318aeba03281a5f1ab49f877456448dcf1", size = 271691 },
+    { url = "https://files.pythonhosted.org/packages/4a/a9/ab16b4649524ca9e05213c1cdbb7faa85cc2aa90a0230d2f796cbaf22736/regex-2026.1.15-cp314-cp314-win_amd64.whl", hash = "sha256:4161d87f85fa831e31469bfd82c186923070fc970b9de75339b68f0c75b51903", size = 280422 },
+    { url = "https://files.pythonhosted.org/packages/be/2a/20fd057bf3521cb4791f69f869635f73e0aaf2b9ad2d260f728144f9047c/regex-2026.1.15-cp314-cp314-win_arm64.whl", hash = "sha256:91c5036ebb62663a6b3999bdd2e559fd8456d17e2b485bf509784cd31a8b1705", size = 273467 },
+    { url = "https://files.pythonhosted.org/packages/ad/77/0b1e81857060b92b9cad239104c46507dd481b3ff1fa79f8e7f865aae38a/regex-2026.1.15-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ee6854c9000a10938c79238de2379bea30c82e4925a371711af45387df35cab8", size = 492073 },
+    { url = "https://files.pythonhosted.org/packages/70/f3/f8302b0c208b22c1e4f423147e1913fd475ddd6230565b299925353de644/regex-2026.1.15-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2c2b80399a422348ce5de4fe40c418d6299a0fa2803dd61dc0b1a2f28e280fcf", size = 292757 },
+    { url = "https://files.pythonhosted.org/packages/bf/f0/ef55de2460f3b4a6da9d9e7daacd0cb79d4ef75c64a2af316e68447f0df0/regex-2026.1.15-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:dca3582bca82596609959ac39e12b7dad98385b4fefccb1151b937383cec547d", size = 291122 },
+    { url = "https://files.pythonhosted.org/packages/cf/55/bb8ccbacabbc3a11d863ee62a9f18b160a83084ea95cdfc5d207bfc3dd75/regex-2026.1.15-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef71d476caa6692eea743ae5ea23cde3260677f70122c4d258ca952e5c2d4e84", size = 807761 },
+    { url = "https://files.pythonhosted.org/packages/8f/84/f75d937f17f81e55679a0509e86176e29caa7298c38bd1db7ce9c0bf6075/regex-2026.1.15-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c243da3436354f4af6c3058a3f81a97d47ea52c9bd874b52fd30274853a1d5df", size = 873538 },
+    { url = "https://files.pythonhosted.org/packages/b8/d9/0da86327df70349aa8d86390da91171bd3ca4f0e7c1d1d453a9c10344da3/regex-2026.1.15-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8355ad842a7c7e9e5e55653eade3b7d1885ba86f124dd8ab1f722f9be6627434", size = 915066 },
+    { url = "https://files.pythonhosted.org/packages/2a/5e/f660fb23fc77baa2a61aa1f1fe3a4eea2bbb8a286ddec148030672e18834/regex-2026.1.15-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f192a831d9575271a22d804ff1a5355355723f94f31d9eef25f0d45a152fdc1a", size = 812938 },
+    { url = "https://files.pythonhosted.org/packages/69/33/a47a29bfecebbbfd1e5cd3f26b28020a97e4820f1c5148e66e3b7d4b4992/regex-2026.1.15-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:166551807ec20d47ceaeec380081f843e88c8949780cd42c40f18d16168bed10", size = 781314 },
+    { url = "https://files.pythonhosted.org/packages/65/ec/7ec2bbfd4c3f4e494a24dec4c6943a668e2030426b1b8b949a6462d2c17b/regex-2026.1.15-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f9ca1cbdc0fbfe5e6e6f8221ef2309988db5bcede52443aeaee9a4ad555e0dac", size = 795652 },
+    { url = "https://files.pythonhosted.org/packages/46/79/a5d8651ae131fe27d7c521ad300aa7f1c7be1dbeee4d446498af5411b8a9/regex-2026.1.15-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b30bcbd1e1221783c721483953d9e4f3ab9c5d165aa709693d3f3946747b1aea", size = 868550 },
+    { url = "https://files.pythonhosted.org/packages/06/b7/25635d2809664b79f183070786a5552dd4e627e5aedb0065f4e3cf8ee37d/regex-2026.1.15-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:2a8d7b50c34578d0d3bf7ad58cde9652b7d683691876f83aedc002862a35dc5e", size = 769981 },
+    { url = "https://files.pythonhosted.org/packages/16/8b/fc3fcbb2393dcfa4a6c5ffad92dc498e842df4581ea9d14309fcd3c55fb9/regex-2026.1.15-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:9d787e3310c6a6425eb346be4ff2ccf6eece63017916fd77fe8328c57be83521", size = 854780 },
+    { url = "https://files.pythonhosted.org/packages/d0/38/dde117c76c624713c8a2842530be9c93ca8b606c0f6102d86e8cd1ce8bea/regex-2026.1.15-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:619843841e220adca114118533a574a9cd183ed8a28b85627d2844c500a2b0db", size = 799778 },
+    { url = "https://files.pythonhosted.org/packages/e3/0d/3a6cfa9ae99606afb612d8fb7a66b245a9d5ff0f29bb347c8a30b6ad561b/regex-2026.1.15-cp314-cp314t-win32.whl", hash = "sha256:e90b8db97f6f2c97eb045b51a6b2c5ed69cedd8392459e0642d4199b94fabd7e", size = 274667 },
+    { url = "https://files.pythonhosted.org/packages/5b/b2/297293bb0742fd06b8d8e2572db41a855cdf1cae0bf009b1cb74fe07e196/regex-2026.1.15-cp314-cp314t-win_amd64.whl", hash = "sha256:5ef19071f4ac9f0834793af85bd04a920b4407715624e40cb7a0631a11137cdf", size = 284386 },
+    { url = "https://files.pythonhosted.org/packages/95/e4/a3b9480c78cf8ee86626cb06f8d931d74d775897d44201ccb813097ae697/regex-2026.1.15-cp314-cp314t-win_arm64.whl", hash = "sha256:ca89c5e596fc05b015f27561b3793dc2fa0917ea0d7507eebb448efd35274a70", size = 274837 },
+]
+
 [[package]]
 name = "requests"
 version = "2.32.4"
@@ -2625,6 +2971,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847 },
 ]
 
+[[package]]
+name = "requests-oauthlib"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "oauthlib" },
+    { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179 },
+]
+
 [[package]]
 name = "rpds-py"
 version = "0.29.0"
@@ -2718,6 +3077,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696 },
 ]
 
+[[package]]
+name = "ruamel-yaml"
+version = "0.19.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c7/3b/ebda527b56beb90cb7652cb1c7e4f91f48649fbcd8d2eb2fb6e77cd3329b/ruamel_yaml-0.19.1.tar.gz", hash = "sha256:53eb66cd27849eff968ebf8f0bf61f46cdac2da1d1f3576dd4ccee9b25c31993", size = 142709 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b8/0c/51f6841f1d84f404f92463fc2b1ba0da357ca1e3db6b7fbda26956c3b82a/ruamel_yaml-0.19.1-py3-none-any.whl", hash = "sha256:27592957fedf6e0b62f281e96effd28043345e0e66001f97683aa9a40c667c93", size = 118102 },
+]
+
 [[package]]
 name = "six"
 version = "1.17.0"
diff --git a/agentic_ai/evaluations/.gitignore b/agentic_ai/evaluations/.gitignore
new file mode 100644
index 000000000..1d2876728
--- /dev/null
+++ b/agentic_ai/evaluations/.gitignore
@@ -0,0 +1,10 @@
+# Evaluation outputs, Python/pytest caches, and local backup/trace files
+eval_results/
+*.pyc
+__pycache__/
+.pytest_cache/
+*.json.bak
+agent_traces.json
+
+# Generated evaluation data (created by run_agent_eval.py)
+evaluation_input_data.jsonl
diff --git a/agentic_ai/evaluations/README.md b/agentic_ai/evaluations/README.md
new file mode 100644
index 000000000..209542b43
--- /dev/null
+++ b/agentic_ai/evaluations/README.md
@@ -0,0 +1,621 @@
+# AI Agent Evaluation Framework
+
+A comprehensive evaluation system for testing AI agents in customer support scenarios. This framework provides both **local evaluation** with custom metrics and **remote evaluation** via Azure AI Foundry with LLM-as-judge capabilities.
+
+---
+
+## Table of Contents
+
+1. [Evaluation Methodology](#evaluation-methodology)
+   - [Why Evaluate AI Agents?](#why-evaluate-ai-agents)
+   - [Single-Turn vs Multi-Turn Evaluation](#single-turn-vs-multi-turn-evaluation)
+   - [Built-in vs Custom Evaluators](#built-in-vs-custom-evaluators)
+2. [Metrics Deep Dive](#metrics-deep-dive)
+   - [Single-Turn Metrics (Tool-Focused)](#single-turn-metrics-tool-focused)
+   - [Multi-Turn Metrics (Outcome-Focused)](#multi-turn-metrics-outcome-focused)
+3. [Setup Guide](#setup-guide)
+   - [Prerequisites](#prerequisites)
+   - [Step 1: Environment Setup](#step-1-environment-setup)
+   - [Step 2: Configure Evaluation Variables](#step-2-configure-evaluation-variables)
+   - [Step 3: Start Services](#step-3-start-services)
+4. [Running Evaluations](#running-evaluations)
+   - [Local Evaluation](#local-evaluation)
+   - [Remote Evaluation (Azure AI Foundry)](#remote-evaluation-azure-ai-foundry)
+   - [Comparing Agents](#comparing-agents)
+5. [Interpreting Results](#interpreting-results)
+6. [Extending the Framework](#extending-the-framework)
+7. [Troubleshooting](#troubleshooting)
+
+---
+
+## Evaluation Methodology
+
+### Why Evaluate AI Agents?
+
+AI agents that use tools (APIs, databases, external services) require evaluation that goes beyond traditional NLP metrics. Unlike simple chatbots, agents must:
+
+1. **Choose the right tools** - Select appropriate APIs for each task
+2. **Use tools correctly** - Pass correct parameters and handle responses
+3. **Maintain context** - Remember information across conversation turns
+4. **Achieve outcomes** - Actually solve the customer's problem
+5. **Communicate effectively** - Provide clear, helpful responses
+
+This framework addresses all these dimensions through a combination of **rule-based metrics** (deterministic, fast) and **LLM-as-judge evaluators** (semantic understanding, nuanced assessment).
+
+### Single-Turn vs Multi-Turn Evaluation
+
+We use fundamentally different evaluation strategies for single-turn and multi-turn conversations because they measure different capabilities:
+
+#### Single-Turn Evaluation (Tool-Focused)
+
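+**Rationale**: In a single exchange, the agent must immediately demonstrate correct tool selection and usage. There's no opportunity for course correction, so we heavily weight tool-level accuracy. <!-- NOTE(review): the single-turn weights listed below sum to 105%, not 100%; confirm against SINGLE_TURN_WEIGHTS in evaluator.py. -->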
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ SINGLE-TURN WEIGHTS                                         │
+├─────────────────────────────────────────────────────────────┤
+│ Tool Behavior (recall, precision, efficiency)     │  10%   │
+│ Tool Call Accuracy (LLM-judge)                    │  15%   │
+│ Task Adherence (LLM-judge)                        │  10%   │
+│ Completeness (success criteria met)               │  10%   │
+│ Response Quality - LLM                            │  10%   │
+│ Response Quality - Basic                          │   5%   │
+│ Grounded Accuracy                                 │  10%   │
+│ Intent Resolution                                 │  10%   │
+│ Coherence                                         │   5%   │
+│ Fluency                                           │   5%   │
+│ Relevance                                         │   5%   │
+│ Solution Accuracy                                 │  10%   │  ← Ground truth match
+└─────────────────────────────────────────────────────────────┘
+```
+
+**Use cases**: Quick lookups, simple queries, one-shot requests
+
+#### Multi-Turn Evaluation (Outcome-Focused)
+
+**Rationale**: In multi-turn conversations, what matters is the **final outcome**, not the path taken. An agent might take different tool sequences across turns but still successfully resolve the customer's issue. Penalizing intermediate tool choices would be counterproductive.
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ MULTI-TURN WEIGHTS                                          │
+├─────────────────────────────────────────────────────────────┤
+│ Solution Accuracy                                 │  30%   │  ← Did we solve it?
+│ Task Adherence                                    │  20%   │  ← Proper procedure?
+│ Intent Resolution                                 │  20%   │  ← All intents handled?
+│ Coherence                                         │  10%   │  ← Logical conversation?
+│ Fluency                                           │  10%   │  ← Quality communication?
+│ Relevance                                         │  10%   │  ← Stayed on topic?
+├─────────────────────────────────────────────────────────────┤
+│ ❌ Tool metrics EXCLUDED - we care about outcomes           │
+└─────────────────────────────────────────────────────────────┘
+```
+
+**Use cases**: Complex problem resolution, account changes requiring multiple steps, escalation flows
+
+### Built-in vs Custom Evaluators
+
+We combine two types of evaluators to get the best of both worlds:
+
+#### Azure AI Foundry Built-in Evaluators
+
+**Why use them:**
+- ✅ Industry-standard LLM-as-judge implementations
+- ✅ Consistent with Azure AI Foundry portal metrics
+- ✅ Maintained and improved by Microsoft
+- ✅ 1-5 scale matching portal visualization
+
+| Evaluator | What it Measures |
+|-----------|------------------|
+| `IntentResolutionEvaluator` | Did the agent understand the customer's intent? |
+| `TaskAdherenceEvaluator` | Did the agent follow proper procedures? |
+| `ToolCallAccuracyEvaluator` | Were tool calls correct and appropriate? |
+| `CoherenceEvaluator` | Was the response logically structured? |
+| `FluencyEvaluator` | Was the language natural and grammatically correct? |
+| `RelevanceEvaluator` | Was the response relevant to the question? |
+
+#### Custom Evaluators
+
+**Why we need them:**
+- 🔧 Domain-specific logic (e.g., billing-specific success criteria)
+- 🎯 Ground truth matching against expected solutions
+- ⚡ Deterministic rules (fast, no API calls needed)
+- 📊 Tool behavior metrics unique to our agent patterns
+
+| Evaluator | What it Measures | Why Custom? |
+|-----------|------------------|-------------|
+| `ToolBehaviorEvaluator` | Recall, precision, efficiency of tool usage | Requires domain knowledge of expected tools |
+| `CompletenessEvaluator` | Success criteria satisfaction | Maps criteria to specific tool requirements |
+| `GroundedAccuracyEvaluator` | Response grounded in tool outputs | Needs access to tool call results |
+| `SolutionAccuracyEvaluator` | Match against ground truth | Uses scenario-specific rubrics |
+
+#### Score Scale: 1-5
+
+All metrics are reported on a **1-5 scale**; a test passes a metric when its score is at or above the **threshold of 3**:
+
+```
+Score   Meaning              Pass?
+─────   ─────────────────    ─────
+  5     Excellent            ✓
+  4     Good                 ✓
+  3     Acceptable           ✓ (threshold)
+  2     Below expectations   ✗
+  1     Poor                 ✗
+```
+
+This matches the Azure AI Foundry portal visualization, making local and remote evaluation results directly comparable.
+
+---
+
+## Metrics Deep Dive
+
+### Single-Turn Metrics (Tool-Focused)
+
+#### 1. Tool Behavior (10%)
+Combines three sub-metrics:
+- **Recall** (50%): Fraction of required tools actually used
+- **Precision** (30%): Fraction of used tools that were relevant
+- **Efficiency** (20%): Required tools / total tools used
+
+```python
+# Example: Required [get_billing_summary], Used [get_billing_summary, get_customer_detail]
+recall = 1.0      # 1/1 required tools used
+precision = 0.5   # 1/2 used tools were in expected set
+efficiency = 0.5  # 1/2 ratio
+```
+
+#### 2. Tool Call Accuracy (15%) - LLM Judge
+Azure AI Foundry's `ToolCallAccuracyEvaluator` assesses:
+- Were the correct tools selected?
+- Were parameters passed correctly?
+- Was the sequence appropriate?
+
+#### 3. Task Adherence (10%) - LLM Judge
+Evaluates whether the agent followed proper procedures and policies.
+
+#### 4. Completeness (10%)
+Checks if scenario-specific success criteria were met:
+```python
+TOOL_CRITERIA_MAP = {
+    "must_access_billing": ["get_billing_summary", "get_subscription_detail"],
+    "must_check_security_logs": ["get_security_logs"],
+    "must_check_promotions": ["get_eligible_promotions"],
+}
+```
+
+#### 5. Response Quality (15% total)
+- **LLM-based** (10%): Semantic quality assessment
+- **Basic** (5%): Length, formatting, structure checks
+
+#### 6. Grounded Accuracy (10%)
+Verifies response is consistent with tool outputs (no hallucination).
+
+#### 7. Intent Resolution (10%) - LLM Judge
+Did the agent correctly understand what the customer wanted?
+
+#### 8-10. Coherence, Fluency, Relevance (5% each)
+Standard NLG quality metrics via Azure AI Foundry evaluators.
+
+#### 11. Solution Accuracy (10%)
+Compares agent response against expected ground truth solution.
+
+### Multi-Turn Metrics (Outcome-Focused)
+
+For multi-turn conversations, we **exclude tool-level metrics** and focus on outcomes:
+
+| Metric | Weight | Rationale |
+|--------|--------|-----------|
+| Solution Accuracy | 30% | The ultimate measure - did we solve the problem? |
+| Task Adherence | 20% | Did we follow proper procedures throughout? |
+| Intent Resolution | 20% | Were all customer intents (across turns) resolved? |
+| Coherence | 10% | Was the overall conversation logical and consistent? |
+| Fluency | 10% | Was communication quality maintained? |
+| Relevance | 10% | Did responses stay relevant across all turns? |
+
+**Why exclude tool metrics for multi-turn?**
+
+Consider a billing dispute that spans 3 turns:
+1. Customer asks about high bill → Agent retrieves billing summary
+2. Customer asks about specific charge → Agent gets usage data  
+3. Customer requests payment plan → Agent records payment
+
+Evaluating tool accuracy at each turn is misleading because:
+- The "expected" tools depend on previous turn outcomes
+- Alternative valid tool sequences exist
+- What matters is: **Was the dispute resolved?**
+
+---
+
+## Setup Guide
+
+### Prerequisites
+
+- Python 3.10+ with `uv` package manager
+- Azure CLI authenticated (`az login`)
+- Existing `.env` file configured (see main repo [SETUP.md](../../SETUP.md))
+- Azure subscription with:
+  - Azure OpenAI resource (already configured in your `.env`)
+  - Azure AI Project (for remote evaluation)
+
+### Step 1: Environment Setup
+
+If you haven't already set up the repository:
+
+```bash
+# Clone repository
+git clone https://github.com/microsoft/OpenAIWorkshop.git
+cd OpenAIWorkshop
+
+# Install dependencies
+uv sync
+```
+
+### Step 2: Configure Evaluation Variables
+
+Add these variables to your existing `.env` file in `agentic_ai/applications/`:
+
+```bash
+# ============================================================
+# EVALUATION-SPECIFIC CONFIGURATION (add to existing .env)
+# ============================================================
+
+# Azure AI Foundry Project Endpoint (Required for --remote evaluation)
+# Get this from: https://ai.azure.com → Your Project → Settings → Project details
+# Look for "Project endpoint" in the format:
+#   https://<region>.api.azureml.ms/...  (older projects)
+#   https://<account>.services.ai.azure.com/api/projects/<project>  (newer projects)
+AZURE_AI_PROJECT_ENDPOINT=https://your-account.services.ai.azure.com/api/projects/your-project
+
+# Evaluation Model (Optional - defaults to AZURE_OPENAI_CHAT_DEPLOYMENT)
+# Use a separate deployment for evaluation to avoid rate limiting
+# Recommended: gpt-4o or gpt-4o-mini for reliable LLM-as-judge evaluation
+AZURE_OPENAI_EVAL_DEPLOYMENT=gpt-4o-mini
+```
+
+**Where to find the Project Endpoint:**
+1. Go to [Azure AI Foundry](https://ai.azure.com)
+2. Select your project
+3. Click **Settings** → **Project details**
+4. Copy the **Project endpoint** URL
+
+> **Note**: The evaluation uses your existing `AZURE_OPENAI_CHAT_DEPLOYMENT` if `AZURE_OPENAI_EVAL_DEPLOYMENT` is not set. Consider using a separate deployment for evaluation to avoid rate limiting during heavy testing.
+
+**Assign required Azure roles:**
+```bash
+# Azure AI Developer role (required for remote evaluation)
+az role assignment create \
+  --assignee $(az ad signed-in-user show --query id -o tsv) \
+  --role "Azure AI Developer" \
+  --scope /subscriptions/{sub-id}/resourceGroups/{rg}/providers/Microsoft.CognitiveServices/accounts/{ai-foundry-account}
+```
+
+### Step 3: Start Services
+
+Start services in this order:
+
+```bash
+# Terminal 1: MCP Server (provides customer data APIs)
+cd mcp
+uv run python mcp_service.py
+# Wait for: "MCP server running on http://localhost:8000"
+
+# Terminal 2: Agent Backend
+cd agentic_ai/applications
+uv run python -m uvicorn backend:app --port 7000 --reload
+# Wait for: "Application startup complete"
+```
+
+Verify services:
+```bash
+curl http://localhost:8000/health  # MCP server
+curl http://localhost:7000/health  # Backend
+```
+
+---
+
+## Running Evaluations
+
+### Command-Line Options
+
+```bash
+cd agentic_ai/applications
+
+uv run python ../evaluations/run_agent_eval.py [OPTIONS]
+```
+
+| Flag | Description |
+|------|-------------|
+| `--agent NAME` | Agent name for tracking (default: from AGENT_MODULE) |
+| `--backend-url URL` | Backend URL (default: http://localhost:7000) |
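+| `--local` | Run local evaluation (the default when neither `--local` nor `--remote` is given) |
+| `--remote` | Run remote evaluation and push results to Azure AI Foundry (skips local evaluation unless `--local` is also given) |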
+| `--single-turn-only` | Run only single-turn test cases |
+| `--multi-turn-only` | Run only multi-turn test cases |
+| `--limit N` | Limit to N test cases (useful for testing) |
+
+### Local Evaluation
+
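+Local evaluation runs the custom metrics (plus the Azure AI evaluator wrappers, if the SDK is available) and does not push results to the Azure AI Foundry portal: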
+
+```bash
+# Basic local evaluation (all test cases)
+uv run python ../evaluations/run_agent_eval.py --agent my_agent
+
+# Single-turn only
+uv run python ../evaluations/run_agent_eval.py --agent my_agent --single-turn-only
+
+# Multi-turn only  
+uv run python ../evaluations/run_agent_eval.py --agent my_agent --multi-turn-only
+
+# Quick test with 2 cases
+uv run python ../evaluations/run_agent_eval.py --agent my_agent --limit 2
+```
+
+**Output:**
+```
+================================================================================
+EVALUATION SUMMARY - http://localhost:7000
+================================================================================
+Agent: my_agent
+Total Tests:    30
+Passed:         26 ✓
+Failed:         4 ✗
+Pass Rate:      86.7%
+Average Score:  4.12
+
+Metric Breakdown (1-5 scale, threshold: 3):
+  tool_behavior                   : 4.2/5 ████████████████     ✓
+  completeness                    : 4.5/5 ██████████████████   ✓
+  solution_accuracy               : 3.8/5 ███████████████      ✓
+  coherence                       : 4.6/5 ██████████████████   ✓
+  ...
+```
+
+### Remote Evaluation (Azure AI Foundry)
+
+Remote evaluation pushes results to Azure AI Foundry portal:
+
+```bash
+# Remote only (skip local evaluation)
+uv run python ../evaluations/run_agent_eval.py --agent my_agent --remote
+
+# Both local and remote
+uv run python ../evaluations/run_agent_eval.py --agent my_agent --local --remote
+```
+
+**What happens:**
+1. Runs test cases against agent backend
+2. Generates `evaluation_input_data.jsonl` in Foundry format
+3. Creates evaluation in Azure AI Foundry with built-in evaluators:
+   - `builtin.coherence`
+   - `builtin.fluency`  
+   - `builtin.relevance`
+   - `builtin.groundedness`
+   - `builtin.task_adherence`
+   - `builtin.intent_resolution`
+   - Custom `label_model` for solution_accuracy
+
+**Portal naming convention:**
+- Evaluation: `my_agent - Single Turn | 2026-02-03 14:30`
+- Run: `my_agent | Single Turn | 2026-02-03 14:30`
+
+### Comparing Agents
+
+Compare different agent implementations:
+
+```bash
+# Compare single vs reflection agents
+uv run python ../evaluations/run_agent_eval.py --agent agent_single --remote
+# Restart the backend with the reflection agent first (--agent is only a tracking label), then:
+uv run python ../evaluations/run_agent_eval.py --agent agent_reflection --remote
+```
+
+View comparison in Azure AI Foundry portal → Evaluations → Compare runs.
+
+---
+
+## Interpreting Results
+
+### Score Thresholds
+
+| Score Range | Meaning | Action |
+|-------------|---------|--------|
+| 4.5 - 5.0 | Excellent | Agent performing optimally |
+| 3.5 - 4.4 | Good | Minor improvements possible |
+| 3.0 - 3.4 | Acceptable | Investigate low-scoring metrics |
+| 2.0 - 2.9 | Below expectations | Requires attention |
+| 1.0 - 1.9 | Poor | Significant issues to fix |
+
+### Common Issues
+
+**Low Tool Behavior Score:**
+- Agent using unnecessary tools (low efficiency)
+- Missing required tools (low recall)
+- Fix: Review agent's tool selection logic
+
+**Low Solution Accuracy:**
+- Agent response doesn't match expected outcome
+- Fix: Check ground truth in dataset, verify agent logic
+
+**Low Coherence/Fluency:**
+- Response structure or language issues
+- Fix: Adjust system prompts for clearer formatting
+
+### Output Files
+
+| File | Description |
+|------|-------------|
+| `eval_results/evaluation_summary.json` | Aggregate scores and pass rates |
+| `eval_results/test_case_results.json` | Per-test-case detailed results |
+| `evaluation_input_data.jsonl` | Foundry-format data for remote evaluation |
+
+---
+
+## Extending the Framework
+
+### Adding Custom Metrics
+
+1. Create evaluator class in `metrics.py`:
+
+```python
+class MyCustomEvaluator:
+    def evaluate(self, response: str, expected: str) -> EvaluationResult:
+        # Your evaluation logic
+        score = ...  # 1-5 scale
+        return EvaluationResult(
+            metric_name="my_metric",
+            metric_type=MetricType.ACCURACY,
+            score=score,
+            passed=score >= 3.0,
+            details={...},
+            explanation="..."
+        )
+```
+
+2. Add to `evaluator.py` weights:
+```python
+SINGLE_TURN_WEIGHTS = {
+    ...
+    "my_metric": 0.05,  # 5% weight
+}
+```
+
+### Adding Test Cases
+
+Add to `eval_dataset.json`:
+
+```json
+{
+  "id": "billing_new_scenario",
+  "customer_query": "Your test query here",
+  "customer_id": 101,
+  "category": "billing",
+  "expected_tools": ["get_billing_summary"],
+  "required_tools": ["get_billing_summary"],
+  "success_criteria": {"must_access_billing": true},
+  "ground_truth_solution": "Expected agent response...",
+  "scoring_rubric": "5: Complete and accurate...",
+  "multi_turn": false
+}
+```
+
+---
+
+## Troubleshooting
+
+### "Missing AZURE_AI_PROJECT_ENDPOINT"
+```bash
+# Add the project endpoint to your .env file (run from the repository root)
+# Get it from: https://ai.azure.com → Your Project → Settings → Project details
+echo 'AZURE_AI_PROJECT_ENDPOINT=https://your-account.services.ai.azure.com/api/projects/your-project' >> agentic_ai/applications/.env
+```
+
+### "Failed to resolve hostname" / DNS Error
+```bash
+# Check for placeholder values in the .env file (run from the repository root) - real URLs are required
+grep "AZURE_AI_PROJECT" agentic_ai/applications/.env
+# Should show your actual Azure endpoint, not placeholders like "your-account"
+```
+
+### "Authentication failed"
+```bash
+az login
+az account show
+# Verify Azure AI Developer role is assigned to your account
+```
+
+### "Cannot connect to backend"
+```bash
+# Check services are running
+curl http://localhost:8000/health  # MCP
+curl http://localhost:7000/health  # Backend
+```
+
+### "No evaluation results in Foundry"
+- Verify `--remote` flag was used
+- Check `AZURE_AI_PROJECT_ENDPOINT` is set correctly
+- Wait 1-2 minutes for portal to update
+
+### "Rate limiting" during evaluation
+```bash
+# Use a separate deployment for evaluation
+# Add to your .env:
+AZURE_OPENAI_EVAL_DEPLOYMENT=gpt-4o-mini-eval
+# This avoids sharing quota with your agent's chat deployment
+```
+
+### Low Scores on All Tests
+- Verify MCP server has test data loaded
+- Check agent can access tools (`DISABLE_AUTH=true` in dev)
+- Review agent logs for errors
+
+---
+
+## Environment Variables Reference
+
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `AZURE_AI_PROJECT_ENDPOINT` | For `--remote` | Azure AI Foundry project endpoint URL |
+| `AZURE_OPENAI_EVAL_DEPLOYMENT` | No | Model deployment for LLM-as-judge (defaults to `AZURE_OPENAI_CHAT_DEPLOYMENT`) |
+| `AZURE_OPENAI_CHAT_DEPLOYMENT` | Yes | Default model deployment (used if eval deployment not set) |
+| `AZURE_OPENAI_ENDPOINT` | Yes | Azure OpenAI resource endpoint |
+| `AZURE_OPENAI_API_KEY` | Yes* | Azure OpenAI API key (*or use managed identity) |
+
+---
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────────────────────────────┐
+│                        Evaluation Framework                             │
+├─────────────────────────────────────────────────────────────────────────┤
+│                                                                         │
+│  run_agent_eval.py                                                      │
+│  ├── Load eval_dataset.json (30 test cases)                            │
+│  ├── Send queries to Agent Backend (HTTP)                              │
+│  ├── Capture tool calls via WebSocket                                  │
+│  └── Run evaluators                                                     │
+│      │                                                                  │
+│      ├── LOCAL EVALUATION (evaluator.py + metrics.py)                  │
+│      │   ├── ToolBehaviorEvaluator (recall, precision, efficiency)     │
+│      │   ├── CompletenessEvaluator (success criteria)                  │
+│      │   ├── ResponseQualityEvaluator (LLM + basic)                    │
+│      │   ├── GroundedAccuracyEvaluator                                 │
+│      │   └── AzureAIEvaluatorSuite (if SDK available)                  │
+│      │       ├── IntentResolutionEvaluator                             │
+│      │       ├── TaskAdherenceEvaluator                                │
+│      │       ├── ToolCallAccuracyEvaluator                             │
+│      │       ├── CoherenceEvaluator                                    │
+│      │       ├── FluencyEvaluator                                      │
+│      │       └── RelevanceEvaluator                                    │
+│      │                                                                  │
+│      └── REMOTE EVALUATION (Azure AI Foundry)                          │
+│          ├── Upload evaluation_input_data.jsonl                        │
+│          ├── Run builtin.* evaluators                                  │
+│          └── Run label_model for solution_accuracy                     │
+│                                                                         │
+└─────────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## File Reference
+
+| File | Purpose |
+|------|---------|
+| `run_agent_eval.py` | Main evaluation script - orchestrates tests, local eval, and remote push |
+| `evaluator.py` | Evaluation runner, weight definitions, result aggregation |
+| `metrics.py` | All metric implementations (custom + Azure AI wrappers) |
+| `eval_dataset.json` | 30 test cases with ground truth and rubrics |
+| `telemetry.py` | Azure Monitor tracing configuration |
+
+**Generated files** (in `.gitignore`):
+| File | Purpose |
+|------|---------|
+| `evaluation_input_data.jsonl` | Generated during `--remote` evaluation for Foundry upload |
+| `eval_results/` | Local evaluation results and reports |
+
+---
+
+## License
+
+This project is part of the Microsoft OpenAI Workshop. See [LICENSE](../../LICENSE) for details.
diff --git a/agentic_ai/evaluations/__init__.py b/agentic_ai/evaluations/__init__.py
new file mode 100644
index 000000000..6a3982e3e
--- /dev/null
+++ b/agentic_ai/evaluations/__init__.py
@@ -0,0 +1,29 @@
+"""
+AI Agent Evaluation Framework
+
+Evaluation toolkit for testing AI agents against business scenarios.
+"""
+
+from .evaluator import AgentEvaluationRunner, AgentTrace, TestCaseResult
+from .metrics import (
+    ToolBehaviorEvaluator,
+    CompletenessEvaluator,
+    ResponseQualityEvaluator,
+    GroundedAccuracyEvaluator,
+    EvaluationResult,
+    MetricType
+)
+
+__all__ = [
+    "AgentEvaluationRunner",
+    "AgentTrace",
+    "TestCaseResult",
+    "ToolBehaviorEvaluator",
+    "CompletenessEvaluator",
+    "ResponseQualityEvaluator",
+    "GroundedAccuracyEvaluator",
+    "EvaluationResult",
+    "MetricType"
+]
+
+__version__ = "1.0.0"
diff --git a/agentic_ai/evaluations/eval_dataset.json b/agentic_ai/evaluations/eval_dataset.json
new file mode 100644
index 000000000..9f6b2d1cd
--- /dev/null
+++ b/agentic_ai/evaluations/eval_dataset.json
@@ -0,0 +1,602 @@
+{
+  "version": "2.0",
+  "description": "Comprehensive evaluation dataset for Contoso customer support agents",
+  "metadata": {
+    "total_scenarios": 30,
+    "single_turn": 25,
+    "multi_turn": 5,
+    "categories": ["billing", "internet", "mobile", "account", "tv", "bundle", "support"]
+  },
+  "test_cases": [
+    {
+      "id": "billing_high_invoice",
+      "category": "billing",
+      "name": "Invoice Higher Than Usual",
+      "customer_query": "Hi, I'm customer 251. I noticed my last invoice was $150, which is much higher than usual. Can you help me understand why?",
+      "customer_id": 251,
+      "expected_tools": ["get_customer_detail", "get_billing_summary", "get_subscription_detail", "get_data_usage", "search_knowledge_base"],
+      "required_tools": ["get_billing_summary", "get_data_usage"],
+      "success_criteria": {
+        "must_access_billing": true,
+        "must_explain_charges": true
+      },
+      "ground_truth_solution": "The customer's invoice is $150 instead of the usual $60 because of data overage charges. Key facts: 1) The plan has a 10GB data cap, 2) Customer used 22GB (12GB over), 3) Overage charges of $7.50/GB apply, 4) Additional $90 explains the higher bill. Solutions: offer courtesy adjustment, recommend plan upgrade, set up usage alerts.",
+      "scoring_rubric": "5=Identifies overage (22GB vs 10GB), explains charges, offers adjustment AND upgrade; 4=Identifies overage, offers solution; 3=Identifies cause but missing details; 2=Vague explanation; 1=Incorrect or unhelpful",
+      "multi_turn": false
+    },
+    {
+      "id": "billing_payment_history",
+      "category": "billing",
+      "name": "Payment History Inquiry",
+      "customer_query": "Hi, I'm customer 5. Can you show me my recent payments? I want to make sure they all went through.",
+      "customer_id": 5,
+      "expected_tools": ["get_customer_detail", "get_billing_summary"],
+      "required_tools": ["get_billing_summary"],
+      "success_criteria": {
+        "must_access_billing": true
+      },
+      "ground_truth_solution": "Show payment history with dates, amounts, and methods. Confirm all payments were successful. Mention autopay option if not enabled. Offer to send payment receipts if needed.",
+      "scoring_rubric": "5=Shows history with dates/amounts/methods, confirms status; 4=Shows history and confirms; 3=Provides payment info but incomplete; 2=Vague response; 1=No payment info",
+      "multi_turn": false
+    },
+    {
+      "id": "billing_autopay_setup",
+      "category": "billing",
+      "name": "Autopay Setup Request",
+      "customer_query": "Hi, I'm customer 10. I keep forgetting to pay my bill on time. Can you help me set up autopay?",
+      "customer_id": 10,
+      "expected_tools": ["get_customer_detail", "get_billing_summary", "get_subscription_detail", "search_knowledge_base"],
+      "required_tools": ["get_billing_summary"],
+      "success_criteria": {
+        "must_access_billing": true,
+        "must_explain_autopay": true
+      },
+      "ground_truth_solution": "Check current autopay status. Explain $5 monthly discount for autopay. Guide through setup process. Confirm payment method on file.",
+      "scoring_rubric": "5=Checks status, mentions $5 discount, explains benefits, guides setup; 4=Explains benefits and setup; 3=Basic autopay info; 2=Generic response; 1=Doesn't help with autopay",
+      "multi_turn": false
+    },
+    {
+      "id": "billing_overdue_invoice",
+      "category": "billing",
+      "name": "Overdue Invoice Question",
+      "customer_query": "Hi, I'm customer 15. I received a notice about an overdue invoice. What happens if I don't pay soon?",
+      "customer_id": 15,
+      "expected_tools": ["get_customer_detail", "get_billing_summary", "search_knowledge_base"],
+      "required_tools": ["get_billing_summary"],
+      "success_criteria": {
+        "must_access_billing": true,
+        "must_explain_consequences": true
+      },
+      "ground_truth_solution": "Show overdue invoices with amounts and due dates. Explain late fee policy and potential service suspension after 30+ days. Offer payment plan if large amount. Process payment if customer wants.",
+      "scoring_rubric": "5=Shows overdue details, explains consequences, offers solutions; 4=Explains consequences and helps; 3=Addresses concern but missing specifics; 2=Generic response; 1=Doesn't address concern",
+      "multi_turn": false
+    },
+    {
+      "id": "billing_refund_request",
+      "category": "billing",
+      "name": "Refund Request for Service Issue",
+      "customer_query": "Hi, I'm customer 20. I was without internet for 3 days last week. Can I get a refund or credit for those days?",
+      "customer_id": 20,
+      "expected_tools": ["get_customer_detail", "get_support_tickets", "get_subscription_detail", "get_billing_summary"],
+      "required_tools": ["get_support_tickets", "get_billing_summary"],
+      "success_criteria": {
+        "must_verify_outage": true,
+        "must_offer_credit": true
+      },
+      "ground_truth_solution": "Verify outage via support tickets or service incidents. Calculate pro-rated credit for 3 days. Apply credit to next invoice. Apologize for inconvenience and confirm credit will appear on next bill.",
+      "scoring_rubric": "5=Verifies outage, calculates credit, applies and confirms; 4=Acknowledges and offers credit; 3=Offers help but missing verification; 2=Generic response; 1=Doesn't address refund",
+      "multi_turn": false
+    },
+    {
+      "id": "internet_slow",
+      "category": "internet",
+      "name": "Internet Slower Than Before",
+      "customer_query": "Hi, I'm customer 252. My internet has been really slow lately. I'm paying for 1Gbps but it feels much slower.",
+      "customer_id": 252,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_support_tickets", "search_knowledge_base"],
+      "required_tools": ["get_subscription_detail"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_provide_troubleshooting": true
+      },
+      "ground_truth_solution": "Check subscription and service status. Look for existing service incidents. Acknowledge known issue if exists. Provide troubleshooting steps (restart router, check cables, test wired). Offer to escalate and mention potential service credit.",
+      "scoring_rubric": "5=Identifies incident, provides troubleshooting, offers escalation AND credit; 4=Identifies issue and troubleshoots; 3=Acknowledges and provides some steps; 2=Generic troubleshooting; 1=Unhelpful",
+      "multi_turn": false
+    },
+    {
+      "id": "internet_upgrade_inquiry",
+      "category": "internet",
+      "name": "Internet Speed Upgrade Options",
+      "customer_query": "Hi, I'm customer 25. I work from home and my current internet is too slow for video calls. What upgrade options do I have?",
+      "customer_id": 25,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_products", "search_knowledge_base"],
+      "required_tools": ["get_subscription_detail", "get_products"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_show_products": true
+      },
+      "ground_truth_solution": "Check current plan. Show upgrade options: Basic (100Mbps/$49.99), Pro (500Mbps/$79.99), Ultimate (1Gbps/$119.99). Recommend Pro for video calls. Explain benefits, show price difference, mention promotions. Upgrades take effect within 24 hours.",
+      "scoring_rubric": "5=Shows current plan, presents options with pricing, recommends, mentions promos; 4=Shows options with pricing and recommends; 3=Lists options but missing personalization; 2=Generic product info; 1=No helpful upgrade info",
+      "multi_turn": false
+    },
+    {
+      "id": "internet_router_reset",
+      "category": "internet",
+      "name": "Router Reset Help",
+      "customer_query": "Hi, I'm customer 30. My router isn't working and I think I need to reset it. How do I do that?",
+      "customer_id": 30,
+      "expected_tools": ["get_customer_detail", "search_knowledge_base"],
+      "required_tools": ["search_knowledge_base"],
+      "success_criteria": {
+        "must_search_knowledge": true,
+        "must_provide_steps": true
+      },
+      "ground_truth_solution": "Provide step-by-step: 1) Locate reset button on back, 2) Use paperclip to press and hold 10 seconds, 3) Wait for router to restart (lights blink), 4) Returns to factory settings, 5) Reconnect using default WiFi on label. Offer technician if uncomfortable.",
+      "scoring_rubric": "5=Complete steps, mentions factory settings warning, offers additional help; 4=Provides reset steps and guidance; 3=Gives instructions but incomplete; 2=Vague instructions; 1=Doesn't help",
+      "multi_turn": false
+    },
+    {
+      "id": "internet_outage_report",
+      "category": "internet",
+      "name": "Internet Outage Report",
+      "customer_query": "Hi, I'm customer 35. My internet is completely down! Nothing is working. Is there an outage in my area?",
+      "customer_id": 35,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_support_tickets", "search_knowledge_base"],
+      "required_tools": ["get_subscription_detail"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_check_incidents": true
+      },
+      "ground_truth_solution": "Check subscription service status and existing incidents. If outage confirmed: apologize, provide ETA, offer notification when restored. If no known outage: create support ticket, provide troubleshooting, offer technician visit, mention service credit for extended outages.",
+      "scoring_rubric": "5=Checks outage status, creates ticket if needed, provides ETA, offers follow-up; 4=Checks status and takes action; 3=Acknowledges and offers help; 2=Generic response; 1=Doesn't address outage",
+      "multi_turn": false
+    },
+    {
+      "id": "internet_static_ip",
+      "category": "internet",
+      "name": "Static IP Request",
+      "customer_query": "Hi, I'm customer 40. I need a static IP address for my home server. Do you offer that?",
+      "customer_id": 40,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_products"],
+      "required_tools": ["get_subscription_detail", "get_products"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_explain_options": true
+      },
+      "ground_truth_solution": "Static IP included in: Pro ($79.99/month) - 1 static IP, Ultimate ($119.99/month) - 1 static IP, Business Enterprise ($299.99/month) - IP block. Basic plan does not include. Check current plan and recommend upgrade to Pro if on Basic.",
+      "scoring_rubric": "5=Checks plan, explains which include static IP, recommends option; 4=Explains availability and recommends upgrade; 3=Mentions static IP but missing plan details; 2=Generic response; 1=Doesn't address request",
+      "multi_turn": false
+    },
+    {
+      "id": "roaming_travel",
+      "category": "mobile",
+      "name": "Travelling Abroad - Needs Roaming",
+      "customer_query": "Hi, I'm customer 253. I'm traveling to Spain in 2 days and need to know about international roaming.",
+      "customer_id": 253,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_products", "search_knowledge_base"],
+      "required_tools": ["get_subscription_detail"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_explain_roaming": true
+      },
+      "ground_truth_solution": "Roaming currently NOT enabled. Packages typically require 3 days to activate, but the trip is in 2 days, so activation may not complete before departure. Spain covered under European options. Urgently enable roaming, recommend appropriate package, warn about timeline, explain rates and usage alerts.",
+      "scoring_rubric": "5=Identifies roaming off, explains urgency, offers to enable AND recommends package; 4=Identifies status and urgency, offers to enable; 3=Identifies not enabled, offers to help; 2=Generic roaming info; 1=Doesn't address request",
+      "multi_turn": false
+    },
+    {
+      "id": "mobile_data_usage",
+      "category": "mobile",
+      "name": "Mobile Data Usage Check",
+      "customer_query": "Hi, I'm customer 45. How much data have I used this month? I don't want to go over my limit.",
+      "customer_id": 45,
+      "expected_tools": ["get_customer_detail", "get_data_usage", "get_subscription_detail"],
+      "required_tools": ["get_data_usage", "get_subscription_detail"],
+      "success_criteria": {
+        "must_check_usage": true,
+        "must_show_limit": true
+      },
+      "ground_truth_solution": "Show current data usage for billing cycle, data cap from plan, days remaining, percentage used. If close to limit: warn about overage, suggest data-saving tips, offer unlimited upgrade, explain usage alerts.",
+      "scoring_rubric": "5=Shows usage, cap, remaining, provides proactive advice; 4=Shows usage and limit clearly; 3=Provides data info; 2=Vague or incomplete; 1=Doesn't provide usage",
+      "multi_turn": false
+    },
+    {
+      "id": "mobile_upgrade_premium",
+      "category": "mobile",
+      "name": "Mobile Plan Upgrade",
+      "customer_query": "Hi, I'm customer 3. I keep running out of data. What mobile plans with more data do you have?",
+      "customer_id": 3,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_products"],
+      "required_tools": ["get_subscription_detail", "get_products"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_show_products": true
+      },
+      "ground_truth_solution": "Current plan: Essential (5GB/$29.99). Recommend Premium ($59.99/month): unlimited data, international roaming, 5G Priority, 50GB hotspot. Explain $30/month price difference, highlight unlimited benefit, offer to process upgrade.",
+      "scoring_rubric": "5=Shows current plan, recommends Premium with pricing, highlights benefits; 4=Provides options with comparison; 3=Mentions options but missing details; 2=Generic product info; 1=Doesn't help",
+      "multi_turn": false
+    },
+    {
+      "id": "mobile_hotspot_question",
+      "category": "mobile",
+      "name": "Mobile Hotspot Inquiry",
+      "customer_query": "Hi, I'm customer 8. Does my mobile plan include hotspot? I need to use it for my laptop.",
+      "customer_id": 8,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_products"],
+      "required_tools": ["get_subscription_detail"],
+      "success_criteria": {
+        "must_check_subscription": true
+      },
+      "ground_truth_solution": "Check current plan. Essential: Hotspot NOT included or limited. Premium: 50GB hotspot included. Explain availability based on their plan, provide usage info or offer Premium upgrade, give instructions on enabling if available.",
+      "scoring_rubric": "5=Checks plan, explains status, provides info or upgrade option; 4=Explains availability for their plan; 3=Addresses question generally; 2=Vague without checking plan; 1=Doesn't address hotspot",
+      "multi_turn": false
+    },
+    {
+      "id": "account_locked",
+      "category": "account",
+      "name": "Account Locked After Failed Logins",
+      "customer_query": "Hi, I'm customer 254. I can't log into my account - it says it's locked!",
+      "customer_id": 254,
+      "expected_tools": ["get_customer_detail", "get_security_logs", "unlock_account", "search_knowledge_base"],
+      "required_tools": ["get_security_logs", "unlock_account"],
+      "success_criteria": {
+        "must_check_security_logs": true,
+        "must_unlock_account": true
+      },
+      "ground_truth_solution": "Security logs show multiple failed login attempts triggering lockout. Security feature to protect account. Verify identity, unlock account using unlock_account tool, confirm accessible. Suggest password reset, recommend 2FA, advise password manager.",
+      "scoring_rubric": "5=Verifies identity, unlocks, confirms, provides security recommendations (password, 2FA); 4=Unlocks and provides one recommendation; 3=Unlocks and confirms; 2=Attempts but doesn't unlock; 1=Doesn't address lockout",
+      "multi_turn": false
+    },
+    {
+      "id": "account_security_check",
+      "category": "account",
+      "name": "Security Audit Request",
+      "customer_query": "Hi, I'm customer 12. I heard about data breaches in the news. Can you check if my account is secure?",
+      "customer_id": 12,
+      "expected_tools": ["get_customer_detail", "get_security_logs", "search_knowledge_base"],
+      "required_tools": ["get_security_logs"],
+      "success_criteria": {
+        "must_check_security_logs": true,
+        "must_provide_recommendations": true
+      },
+      "ground_truth_solution": "Review security logs for suspicious activity, failed logins from unknown locations, unauthorized access. Provide recommendations: enable 2FA, use strong unique password, update every 90 days, never share credentials, monitor account. Reassure and explain security measures.",
+      "scoring_rubric": "5=Reviews logs, reports findings, comprehensive recommendations; 4=Checks status and provides recommendations; 3=Reviews but limited recommendations; 2=Generic advice without checking; 1=Doesn't address concern",
+      "multi_turn": false
+    },
+    {
+      "id": "account_update_contact",
+      "category": "account",
+      "name": "Update Contact Information",
+      "customer_query": "Hi, I'm customer 18. I have a new email and phone number. Can you update my account information?",
+      "customer_id": 18,
+      "expected_tools": ["get_customer_detail"],
+      "required_tools": ["get_customer_detail"],
+      "success_criteria": {
+        "must_access_customer": true
+      },
+      "ground_truth_solution": "Retrieve current contact details. Verify identity. Collect new email and phone. Explain verification process for new contact info. Note: new email may require verification, update affects notifications/billing alerts, password reset links go to email on file.",
+      "scoring_rubric": "5=Shows current info, requests new details, explains verification, updates preferences; 4=Helps with update and explains process; 3=Acknowledges and provides guidance; 2=Generic without checking info; 1=Doesn't help",
+      "multi_turn": false
+    },
+    {
+      "id": "account_paperless_billing",
+      "category": "account",
+      "name": "Paperless Billing Setup",
+      "customer_query": "Hi, I'm customer 22. I want to go paperless and stop receiving paper bills. How do I do that?",
+      "customer_id": 22,
+      "expected_tools": ["get_customer_detail", "search_knowledge_base"],
+      "required_tools": ["get_customer_detail"],
+      "success_criteria": {
+        "must_access_customer": true
+      },
+      "ground_truth_solution": "Check current billing preferences. Verify email on file. Enable paperless billing. Confirm: bills sent to email, paper stops within 1-2 cycles, can view all bills online, email notifications for new bills.",
+      "scoring_rubric": "5=Checks settings, confirms email, enables paperless, explains benefits; 4=Enables and confirms; 3=Provides guidance; 2=Generic without checking; 1=Doesn't help",
+      "multi_turn": false
+    },
+    {
+      "id": "tv_channel_lineup",
+      "category": "tv",
+      "name": "TV Channel Lineup Question",
+      "customer_query": "Hi, I'm customer 28. What channels do I get with my TV streaming plan?",
+      "customer_id": 28,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_products"],
+      "required_tools": ["get_subscription_detail"],
+      "success_criteria": {
+        "must_check_subscription": true
+      },
+      "ground_truth_solution": "TV plans: Basic ($34.99/month): 50+ channels, 2 screens, 7-day replay. Premium ($64.99/month): 150+ channels, 4 screens, 30-day replay, sports, movies. Check current subscription, list features, mention upgrade if on Basic, explain streaming app access.",
+      "scoring_rubric": "5=Shows plan details, lists features, mentions upgrade if applicable; 4=Explains channels and features; 3=Provides plan info; 2=Generic TV info; 1=Doesn't address question",
+      "multi_turn": false
+    },
+    {
+      "id": "tv_add_sports",
+      "category": "tv",
+      "name": "Add Sports Package",
+      "customer_query": "Hi, I'm customer 32. I want to watch football games. Do you have a sports package I can add?",
+      "customer_id": 32,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_products"],
+      "required_tools": ["get_subscription_detail", "get_products"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_show_products": true
+      },
+      "ground_truth_solution": "Sports included in TV Streaming Premium ($64.99/month). Basic does not include sports. Check current subscription. If on Basic, offer upgrade to Premium which includes sports plus movie channels, 4 screens, 30-day replay. Calculate price difference.",
+      "scoring_rubric": "5=Checks plan, explains sports in Premium, shows pricing, offers upgrade; 4=Explains availability and upgrade; 3=Mentions sports info; 2=Generic without checking plan; 1=Doesn't help",
+      "multi_turn": false
+    },
+    {
+      "id": "bundle_inquiry",
+      "category": "bundle",
+      "name": "Bundle Package Inquiry",
+      "customer_query": "Hi, I'm customer 38. I have internet and mobile separately. Would I save money if I bundle them?",
+      "customer_id": 38,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_products"],
+      "required_tools": ["get_subscription_detail", "get_products"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_show_products": true
+      },
+      "ground_truth_solution": "Bundle - Family Complete: $199.99/month includes 500Mbps Internet, 150+ TV Channels, 2 Unlimited Mobile Lines, 20% discount vs individual. Check current subscriptions and total cost, calculate potential savings, explain bundle includes more, show value, offer to switch.",
+      "scoring_rubric": "5=Shows current cost, calculates savings, explains benefits, offers switch; 4=Explains options and potential savings; 3=Provides bundle info; 2=Generic bundle info; 1=Doesn't help",
+      "multi_turn": false
+    },
+    {
+      "id": "promotion_eligibility",
+      "category": "bundle",
+      "name": "Promotion Eligibility Check",
+      "customer_query": "Hi, I'm customer 42. Are there any promotions or discounts I'm eligible for?",
+      "customer_id": 42,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_eligible_promotions"],
+      "required_tools": ["get_customer_detail", "get_eligible_promotions"],
+      "success_criteria": {
+        "must_access_customer": true,
+        "must_check_promotions": true
+      },
+      "ground_truth_solution": "Available promotions: 1) New Customer 20% off (if new), 2) Bundle & Save $50/month (if 3+ services), 3) Loyalty Reward free speed upgrade (if Gold/Platinum), 4) Refer a Friend $100 credit. Check loyalty level and services, identify applicable promotions, explain how to take advantage.",
+      "scoring_rubric": "5=Checks eligibility, lists applicable promos, explains how to apply; 4=Identifies promotions customer qualifies for; 3=Mentions available promotions; 2=Generic list without checking; 1=Doesn't help",
+      "multi_turn": false
+    },
+    {
+      "id": "loyalty_benefits",
+      "category": "bundle",
+      "name": "Loyalty Program Benefits",
+      "customer_query": "Hi, I'm customer 48. I've been with you for years. What loyalty benefits do I get?",
+      "customer_id": 48,
+      "expected_tools": ["get_customer_detail", "get_products", "search_knowledge_base"],
+      "required_tools": ["get_customer_detail"],
+      "success_criteria": {
+        "must_access_customer": true
+      },
+      "ground_truth_solution": "Loyalty tiers: Bronze (basic support), Silver (priority support, occasional discounts), Gold (24/7 VIP support, free speed upgrades, special promotions), Platinum (all Gold plus dedicated account manager). Check current level, explain tier benefits, mention how to reach next tier, highlight current Gold/Platinum promotion.",
+      "scoring_rubric": "5=Shows level, explains tier benefits, mentions upgrade path and promos; 4=Explains benefits for their tier; 3=Provides loyalty info; 2=Generic without checking level; 1=Doesn't address question",
+      "multi_turn": false
+    },
+    {
+      "id": "support_ticket_status",
+      "category": "support",
+      "name": "Support Ticket Status Check",
+      "customer_query": "Hi, I'm customer 6. I opened a support ticket a few days ago. Can you check the status?",
+      "customer_id": 6,
+      "expected_tools": ["get_customer_detail", "get_support_tickets"],
+      "required_tools": ["get_support_tickets"],
+      "success_criteria": {
+        "must_check_tickets": true
+      },
+      "ground_truth_solution": "Look up open/pending tickets. Provide ticket number and status. Explain current stage of resolution. Provide expected timeline. If pending: explain what's being done, offer to escalate if delayed, provide contact for urgent issues.",
+      "scoring_rubric": "5=Finds ticket, shows status, explains next steps, offers escalation; 4=Provides status and explanation; 3=Finds and reports status; 2=Generic without checking; 1=Doesn't help",
+      "multi_turn": false
+    },
+    {
+      "id": "support_new_ticket",
+      "category": "support",
+      "name": "Create New Support Ticket",
+      "customer_query": "Hi, I'm customer 14. My cable box keeps rebooting randomly. I need someone to look at this.",
+      "customer_id": 14,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_support_tickets"],
+      "required_tools": ["get_subscription_detail"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_offer_support": true
+      },
+      "ground_truth_solution": "Document cable box issue (random reboots). Check subscription for equipment details. Basic troubleshooting: unplug 30 seconds, check connections. If persists, create support ticket with equipment type, issue description, troubleshooting attempted, priority level. Offer technician visit.",
+      "scoring_rubric": "5=Documents issue, tries troubleshooting, creates ticket, offers technician; 4=Creates ticket and offers options; 3=Acknowledges and offers help; 2=Generic troubleshooting without ticket; 1=Doesn't address issue",
+      "multi_turn": false
+    },
+    {
+      "id": "multi_billing_dispute",
+      "category": "billing",
+      "name": "[Multi-Turn] Billing Dispute Resolution",
+      "multi_turn": true,
+      "turns": [
+        {
+          "turn_number": 1,
+          "customer_query": "Hi, I'm customer 7. There's a $50 charge on my bill I don't recognize. What is this for?",
+          "expected_tools": ["get_billing_summary"],
+          "expected_response_elements": ["charge", "invoice", "billing"]
+        },
+        {
+          "turn_number": 2,
+          "customer_query": "I didn't order any equipment or additional services. Can you remove this charge?",
+          "expected_tools": [],
+          "expected_response_elements": ["credit", "remove", "adjustment"]
+        },
+        {
+          "turn_number": 3,
+          "customer_query": "Thanks for the credit. While I have you, are there any promotions I qualify for?",
+          "expected_tools": ["get_customer_detail", "get_eligible_promotions"],
+          "expected_response_elements": ["promotion", "discount", "offer"]
+        }
+      ],
+      "customer_id": 7,
+      "expected_tools": ["get_customer_detail", "get_billing_summary", "get_subscription_detail", "get_eligible_promotions"],
+      "required_tools": ["get_billing_summary"],
+      "success_criteria": {
+        "must_access_billing": true,
+        "must_handle_credit": true,
+        "must_check_promotions": true
+      },
+      "ground_truth_solution": "Turn 1: Pull billing summary, identify $50 charge, explain what it's for. Turn 2: If erroneous, apply credit; if valid, explain but offer goodwill credit if appropriate, confirm adjustment on next bill. Turn 3: Review loyalty level and services, identify applicable promotions, recommend best options.",
+      "scoring_rubric": "5=Investigates thoroughly, handles credit appropriately, provides personalized promo info; 4=Addresses each turn adequately; 3=Responds but missing depth; 2=Misses context between turns; 1=Fails to address dispute or loses context"
+    },
+    {
+      "id": "multi_internet_troubleshoot",
+      "category": "internet",
+      "name": "[Multi-Turn] Internet Troubleshooting Flow",
+      "multi_turn": true,
+      "turns": [
+        {
+          "turn_number": 1,
+          "customer_query": "Hi, I'm customer 16. My internet keeps dropping every few minutes. It's really frustrating.",
+          "expected_tools": ["get_subscription_detail", "get_support_tickets"],
+          "expected_response_elements": ["internet", "issue", "connection"]
+        },
+        {
+          "turn_number": 2,
+          "customer_query": "I already tried restarting the router. It worked for a bit but started dropping again.",
+          "expected_tools": ["search_knowledge_base"],
+          "expected_response_elements": ["troubleshoot", "check", "cable"]
+        },
+        {
+          "turn_number": 3,
+          "customer_query": "I checked the cables and they look fine. I think there might be something wrong with the equipment.",
+          "expected_tools": [],
+          "expected_response_elements": ["technician", "appointment", "visit"]
+        },
+        {
+          "turn_number": 4,
+          "customer_query": "Yes, please schedule a technician. What times are available?",
+          "expected_tools": [],
+          "expected_response_elements": ["scheduled", "appointment", "confirm"]
+        }
+      ],
+      "customer_id": 16,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_support_tickets", "search_knowledge_base"],
+      "required_tools": ["get_subscription_detail"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_provide_troubleshooting": true,
+        "must_offer_technician": true
+      },
+      "ground_truth_solution": "Turn 1: Check subscription/incidents, acknowledge issue. Turn 2: Since router restart tried, suggest cable check, wired connection test, interference check. Turn 3: Acknowledge customer tried troubleshooting, agree equipment may need inspection, offer technician. Turn 4: Offer time slots, confirm details, provide arrival window.",
+      "scoring_rubric": "5=Progressive troubleshooting, builds on previous turns, smooth escalation; 4=Addresses each step, schedules technician; 3=Follows flow but may skip steps; 2=Repetitive or doesn't build on attempts; 1=Doesn't progress logically"
+    },
+    {
+      "id": "multi_service_cancellation",
+      "category": "account",
+      "name": "[Multi-Turn] Service Cancellation Retention",
+      "multi_turn": true,
+      "turns": [
+        {
+          "turn_number": 1,
+          "customer_query": "Hi, I'm customer 24. I want to cancel my internet service. It's too expensive.",
+          "expected_tools": ["get_subscription_detail", "get_billing_summary"],
+          "expected_response_elements": ["cancel", "service", "understand"]
+        },
+        {
+          "turn_number": 2,
+          "customer_query": "I've been paying $119 a month and I found a competitor offering $70 for similar speeds.",
+          "expected_tools": ["get_products"],
+          "expected_response_elements": ["offer", "discount", "match", "retention"]
+        },
+        {
+          "turn_number": 3,
+          "customer_query": "A 20% discount sounds good. What would my new monthly rate be?",
+          "expected_tools": [],
+          "expected_response_elements": ["$95", "monthly", "rate", "discount"]
+        }
+      ],
+      "customer_id": 24,
+      "expected_tools": ["get_customer_detail", "get_subscription_detail", "get_billing_summary", "get_products"],
+      "required_tools": ["get_subscription_detail", "get_billing_summary"],
+      "success_criteria": {
+        "must_check_subscription": true,
+        "must_attempt_retention": true
+      },
+      "ground_truth_solution": "Turn 1: Pull subscription/billing, express understanding, ask about needs, don't immediately accept cancellation. Turn 2: Acknowledge competitor pricing, check retention offers, offer 20% loyalty discount or price match, highlight value-adds. Turn 3: Calculate new rate ($119 x 0.8 = $95.20), confirm discount applied, explain duration, thank for staying.",
+      "scoring_rubric": "5=Empathetic handling, competitive retention offer, calculates rate, secures retention; 4=Makes appropriate offer and calculates; 3=Attempts retention but may miss personalization; 2=Too quick to cancel or weak retention; 1=Processes cancellation without effort"
+    },
+    {
+      "id": "multi_new_customer_setup",
+      "category": "internet",
+      "name": "[Multi-Turn] New Service Setup Assistance",
+      "multi_turn": true,
+      "turns": [
+        {
+          "turn_number": 1,
+          "customer_query": "Hi, I'm customer 2. I just moved to a new apartment and need to set up internet. What are my options?",
+          "expected_tools": ["get_products"],
+          "expected_response_elements": ["internet", "plans", "options"]
+        },
+        {
+          "turn_number": 2,
+          "customer_query": "I work from home and need reliable internet for video calls. Which plan do you recommend?",
+          "expected_tools": ["get_subscription_detail"],
+          "expected_response_elements": ["Pro", "500Mbps", "recommend"]
+        },
+        {
+          "turn_number": 3,
+          "customer_query": "The Pro plan sounds good. Do you have any current promotions for new setups?",
+          "expected_tools": ["get_eligible_promotions"],
+          "expected_response_elements": ["promotion", "discount", "new customer"]
+        },
+        {
+          "turn_number": 4,
+          "customer_query": "Great! Please set me up with the Pro plan and the new customer discount.",
+          "expected_tools": [],
+          "expected_response_elements": ["confirm", "order", "setup", "welcome"]
+        }
+      ],
+      "customer_id": 2,
+      "expected_tools": ["get_customer_detail", "get_products", "get_subscription_detail", "get_eligible_promotions"],
+      "required_tools": ["get_products"],
+      "success_criteria": {
+        "must_show_products": true,
+        "must_recommend_plan": true,
+        "must_check_promotions": true
+      },
+      "ground_truth_solution": "Turn 1: List plans (Basic, Pro, Ultimate) with speeds and pricing. Turn 2: Recommend Pro (500Mbps) for WFH video calls, explain why suitable. Turn 3: New Customer 20% off first 3 months, WiFi 6 router included, installation options. Turn 4: Confirm Pro @ $79.99, apply 20% (first 3 months = $63.99), set installation, welcome.",
+      "scoring_rubric": "5=Natural sales flow, personalized recommendation, applies promo, completes smoothly; 4=Guides through selection and setup; 3=Completes but may lack personalization; 2=Disjointed or missing steps; 1=Doesn't complete setup"
+    },
+    {
+      "id": "multi_complex_account_issue",
+      "category": "account",
+      "name": "[Multi-Turn] Complex Account Resolution",
+      "multi_turn": true,
+      "turns": [
+        {
+          "turn_number": 1,
+          "customer_query": "Hi, I'm customer 11. I have several issues. First, I was charged for a service I cancelled last month.",
+          "expected_tools": ["get_billing_summary", "get_subscription_detail"],
+          "expected_response_elements": ["charge", "cancelled", "billing"]
+        },
+        {
+          "turn_number": 2,
+          "customer_query": "Also, my internet has been slow for the past week. Are there any known issues?",
+          "expected_tools": ["get_support_tickets"],
+          "expected_response_elements": ["slow", "internet", "incident", "issue"]
+        },
+        {
+          "turn_number": 3,
+          "customer_query": "One more thing - I want to downgrade my TV package. I don't watch that much anymore.",
+          "expected_tools": ["get_products"],
+          "expected_response_elements": ["downgrade", "TV", "package", "change"]
+        },
+        {
+          "turn_number": 4,
+          "customer_query": "Can you summarize all the changes you're making to my account?",
+          "expected_tools": [],
+          "expected_response_elements": ["summary", "credit", "downgrade", "changes"]
+        }
+      ],
+      "customer_id": 11,
+      "expected_tools": ["get_customer_detail", "get_billing_summary", "get_subscription_detail", "get_support_tickets", "get_products"],
+      "required_tools": ["get_billing_summary", "get_subscription_detail"],
+      "success_criteria": {
+        "must_access_billing": true,
+        "must_check_subscription": true,
+        "must_handle_multiple_issues": true
+      },
+      "ground_truth_solution": "Turn 1: Check billing for cancelled service charge, identify erroneous charge, apply credit/refund. Turn 2: Check service incidents and subscription status, provide status/ETA or troubleshooting. Turn 3: Show current TV package, explain downgrade options (Premium to Basic), calculate savings, process change. Turn 4: Recap all changes: credit applied, internet issue status, TV downgrade and savings.",
+      "scoring_rubric": "5=Handles all 3 issues effectively, provides clear summary, maintains context; 4=Addresses all with reasonable resolution; 3=Handles most but may miss one or lack summary; 2=Loses track or incomplete resolution; 1=Unable to handle multiple issues"
+    }
+  ]
+}
diff --git a/agentic_ai/evaluations/evaluator.py b/agentic_ai/evaluations/evaluator.py
new file mode 100644
index 000000000..c0028f1bf
--- /dev/null
+++ b/agentic_ai/evaluations/evaluator.py
@@ -0,0 +1,458 @@
+"""
+Evaluation runner for AI Agent testing.
+Tests agents against the evaluation dataset and generates reports.
+Supports multi-turn conversations and Azure AI Foundry evaluators.
+"""
+
+import json
+import os
+from typing import Dict, List, Any, Optional
+from datetime import datetime
+from dataclasses import dataclass, asdict, field
+import sys
+
+from metrics import (
+    ToolBehaviorEvaluator,
+    CompletenessEvaluator,
+    ResponseQualityEvaluator,
+    GroundedAccuracyEvaluator,
+    AzureAIEvaluatorSuite,
+    EvaluationResult,
+    AZURE_EVALUATORS_AVAILABLE,
+)
+
+
+@dataclass
+class AgentTrace:
+    """Captured trace of one agent execution: the query, the reply and the tools called."""
+    # Customer query sent to the agent; the runner also compares it with the
+    # dataset query when matching a trace to a test case.
+    query: str
+    # Final response text that the evaluators score.
+    response: str
+    # One dict per tool invocation; the runner reads the "name" and "result" keys.
+    tool_calls: List[Dict[str, Any]]
+    # Free-form metadata; the runner looks up "test_id" here to match a test case.
+    metadata: Dict[str, Any]
+
+
+@dataclass
+class ConversationTurn:
+    """A single turn in a multi-turn conversation."""
+    # Customer message for this turn.
+    query: str
+    # Agent reply for this turn.
+    response: str
+    # Tool invocations made during this turn; each instance gets its own empty
+    # list by default (default_factory avoids a shared mutable default).
+    tool_calls: List[Dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass
+class MultiTurnTrace:
+    """Captured trace of a multi-turn conversation."""
+    turns: List[ConversationTurn]
+    metadata: Dict[str, Any]
+    
+    @property
+    def full_response(self) -> str:
+        """Concatenate all responses for evaluation."""
+        return "\n\n".join(t.response for t in self.turns)
+    
+    @property
+    def all_tool_calls(self) -> List[Dict[str, Any]]:
+        """Aggregate all tool calls across turns."""
+        return [call for turn in self.turns for call in turn.tool_calls]
+    
+    @property
+    def first_query(self) -> str:
+        """Get the first query for matching."""
+        return self.turns[0].query if self.turns else ""
+
+
+@dataclass
+class TestCaseResult:
+    """Result of evaluating a single test case."""
+    # "id" of the dataset entry ("unknown" when the entry has none).
+    test_case_id: str
+    # Query and response copied from the evaluated trace.
+    query: str
+    agent_response: str
+    # Every metric produced for this test case, including ones with zero weight.
+    metrics: List[EvaluationResult]
+    # Weighted average of the weighted metrics (0.0 when none carries weight).
+    overall_score: float
+    passed: bool
+    # ISO-8601 timestamp taken when the result was built.
+    timestamp: str
+    is_multi_turn: bool = False
+    # Number of turns declared by the test case; 1 for single-turn cases.
+    turn_count: int = 1
+
+
+class AgentEvaluationRunner:
+    """Main evaluation runner for agent testing."""
+    
+    # Both weight tables are keyed by EvaluationResult.metric_name. A metric
+    # without an entry gets weight 0 and is left out of the overall score.
+    # The overall score divides by the summed weight of the metrics actually
+    # present, so the tables do not have to add up to exactly 1.0.
+    #
+    # Weights for SINGLE-TURN evaluation (tool-focused)
+    # NOTE(review): the LLM path of ResponseQualityEvaluator reports its metric
+    # as "response_quality", which has no entry below ("response_quality_llm"
+    # does) - confirm which name is intended.
+    SINGLE_TURN_WEIGHTS = {
+        "tool_behavior": 0.10,
+        "tool_call_accuracy": 0.15,
+        "task_adherence": 0.10,
+        "completeness": 0.10,
+        "response_quality_llm": 0.10,
+        "response_quality_basic": 0.05,
+        "grounded_accuracy": 0.10,
+        "intent_resolution": 0.10,
+        "coherence": 0.05,
+        "fluency": 0.05,
+        "relevance": 0.05,
+        "solution_accuracy": 0.10,
+    }
+    
+    # Weights for MULTI-TURN evaluation (outcome-focused)
+    # De-emphasizes tool-level metrics, focuses on overall outcome
+    MULTI_TURN_WEIGHTS = {
+        "solution_accuracy": 0.30,      # Did we achieve the right outcome?
+        "task_adherence": 0.20,         # Did we follow proper procedures?
+        "intent_resolution": 0.20,      # Were all intents resolved?
+        "coherence": 0.10,              # Was conversation logical?
+        "fluency": 0.10,                # Was communication quality good?
+        "relevance": 0.10,              # Were responses relevant throughout?
+        # Tool metrics excluded from multi-turn
+    }
+    
+    def __init__(
+        self,
+        dataset_path: str = "eval_dataset.json",
+        azure_openai_client=None,
+        use_azure_evaluators: bool = True,
+    ):
+        """
+        Initialize evaluation runner.
+        
+        Args:
+            dataset_path: Path to evaluation dataset JSON
+            azure_openai_client: Optional Azure OpenAI client for LLM-as-judge
+            use_azure_evaluators: Whether to use Azure AI Foundry evaluators
+        """
+        self.dataset_path = dataset_path
+        self.test_cases = self._load_dataset()
+        self.llm_client = azure_openai_client
+        
+        # Initialize evaluators
+        self.tool_evaluator = ToolBehaviorEvaluator()
+        self.completeness_evaluator = CompletenessEvaluator()
+        self.quality_evaluator = ResponseQualityEvaluator(azure_openai_client)
+        self.accuracy_evaluator = GroundedAccuracyEvaluator(azure_openai_client)
+        
+        # Initialize Azure AI evaluators if available and enabled
+        self.azure_evaluators = None
+        if use_azure_evaluators and AZURE_EVALUATORS_AVAILABLE:
+            self.azure_evaluators = AzureAIEvaluatorSuite()
+            if not self.azure_evaluators.available:
+                self.azure_evaluators = None
+    
+    def _load_dataset(self) -> List[Dict[str, Any]]:
+        """Load evaluation dataset from JSON."""
+        with open(self.dataset_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        return data.get("test_cases", [])
+    
+    def evaluate_agent_response(
+        self,
+        test_case: Dict[str, Any],
+        agent_trace: AgentTrace,
+    ) -> TestCaseResult:
+        """
+        Evaluate a single agent response against test case.
+        
+        Runs the tool, completeness, quality and accuracy evaluators (plus the
+        Azure AI suite when configured), then combines the metrics that carry a
+        weight into a weighted average.
+        
+        Args:
+            test_case: Test case from dataset
+            agent_trace: Captured agent execution trace
+            
+        Returns:
+            TestCaseResult with all evaluation metrics
+        """
+        metrics: List[EvaluationResult] = []
+        is_multi_turn = test_case.get("multi_turn", False)
+        
+        # 1. Evaluate tool usage
+        tool_names = [call.get("name", "") for call in agent_trace.tool_calls]
+        
+        # Use required_tools if specified, otherwise fall back to expected_tools
+        # (only a missing key falls back here; an explicit value is passed through)
+        required_tools = test_case.get("required_tools")
+        if required_tools is None:
+            required_tools = test_case.get("expected_tools", [])
+        
+        tool_result = self.tool_evaluator.evaluate(
+            expected_tools=test_case.get("expected_tools", []),
+            actual_tools=tool_names,
+            required_tools=required_tools
+        )
+        metrics.append(tool_result)
+        
+        # 2. Evaluate completeness
+        completeness_result = self.completeness_evaluator.evaluate(
+            success_criteria=test_case.get("success_criteria", {}),
+            tool_calls=agent_trace.tool_calls
+        )
+        metrics.append(completeness_result)
+        
+        # 3. Evaluate response quality  
+        tool_summary = f"Tools used: {', '.join(tool_names)}" if tool_names else "No tools used"
+        quality_result = self.quality_evaluator.evaluate(
+            query=agent_trace.query,
+            response=agent_trace.response,
+            tool_summary=tool_summary
+        )
+        metrics.append(quality_result)
+        
+        # 4. Evaluate accuracy against the tool outputs
+        # Only calls with a truthy "result" contribute; None is passed when
+        # there is nothing to ground against.
+        tool_outputs = "; ".join([str(call.get("result", "")) for call in agent_trace.tool_calls if call.get("result")])
+        accuracy_result = self.accuracy_evaluator.evaluate(
+            response=agent_trace.response,
+            tool_outputs=tool_outputs if tool_outputs else None
+        )
+        metrics.append(accuracy_result)
+        
+        # 5. Azure AI Foundry evaluators (if available)
+        if self.azure_evaluators:
+            azure_results = self.azure_evaluators.evaluate_all(
+                query=agent_trace.query,
+                response=agent_trace.response,
+                ground_truth=test_case.get("ground_truth_solution"),
+                scoring_rubric=test_case.get("scoring_rubric"),
+                tool_calls=agent_trace.tool_calls if not is_multi_turn else None,  # Skip tool eval for multi-turn
+                llm_client=self.llm_client,
+            )
+            metrics.extend(azure_results)
+        
+        # Use different weights based on single-turn vs multi-turn
+        # NOTE(review): tool_behavior and completeness have no entry in
+        # MULTI_TURN_WEIGHTS, so a multi-turn case appears to score 0.0 (and
+        # fail) whenever the Azure suite is not active - confirm this is intended.
+        if is_multi_turn:
+            weights = self.MULTI_TURN_WEIGHTS
+            # For multi-turn, only require outcome metrics to pass
+            required_pass_metrics = []  # No strict requirements, use overall score
+        else:
+            weights = self.SINGLE_TURN_WEIGHTS
+            required_pass_metrics = ["tool_behavior", "completeness"]
+        
+        total_score = 0.0
+        total_weight = 0.0
+        
+        # Only metrics with a positive weight count; dividing by the weight
+        # actually used keeps the average on the 1-5 scale when some weighted
+        # metrics are absent.
+        for metric in metrics:
+            weight = weights.get(metric.metric_name, 0.0)  # 0 weight = excluded
+            if weight > 0:
+                total_score += metric.score * weight
+                total_weight += weight
+        
+        # Overall score is weighted average (on 1-5 scale)
+        overall_score = total_score / total_weight if total_weight > 0 else 0.0
+        # Threshold: 3/5 to pass
+        if is_multi_turn:
+            passed = overall_score >= 3.0  # Outcome-based pass for multi-turn
+        else:
+            # Single-turn additionally requires the tool and completeness gates.
+            passed = overall_score >= 3.0 and all(m.passed for m in metrics if m.metric_name in required_pass_metrics)
+        
+        return TestCaseResult(
+            test_case_id=test_case.get("id", "unknown"),
+            query=agent_trace.query,
+            agent_response=agent_trace.response,
+            metrics=metrics,
+            overall_score=overall_score,
+            passed=passed,
+            timestamp=datetime.now().isoformat(),
+            is_multi_turn=is_multi_turn,
+            turn_count=len(test_case.get("turns", [])) if is_multi_turn else 1,
+        )
+    
+    def run_evaluation(
+        self,
+        agent_traces: List[AgentTrace],
+        output_dir: str = "eval_results"
+    ) -> Dict[str, Any]:
+        """
+        Run evaluation on all agent traces.
+        
+        Args:
+            agent_traces: List of captured agent execution traces
+            output_dir: Directory to save evaluation results
+            
+        Returns:
+            Summary of evaluation results
+        """
+        os.makedirs(output_dir, exist_ok=True)
+        
+        results: List[TestCaseResult] = []
+        
+        # Match traces to test cases
+        for test_case in self.test_cases:
+            # Find matching trace by test_id in metadata or by query
+            matching_trace = None
+            test_id = test_case.get("id", "")
+            
+            # Get customer query - for multi-turn, use first turn's query
+            if test_case.get("multi_turn", False):
+                turns = test_case.get("turns", [])
+                customer_query = turns[0]["customer_query"] if turns else ""
+            else:
+                customer_query = test_case.get("customer_query", "")
+            
+            for trace in agent_traces:
+                # Match by test_id in metadata first
+                if trace.metadata.get("test_id") == test_id:
+                    matching_trace = trace
+                    break
+                # Fallback to query matching
+                if customer_query and trace.query.lower().strip() == customer_query.lower().strip():
+                    matching_trace = trace
+                    break
+            
+            if not matching_trace:
+                print(f"⚠ Warning: No trace found for test case {test_case['id']}")
+                continue
+            
+            # Evaluate
+            result = self.evaluate_agent_response(test_case, matching_trace)
+            results.append(result)
+            
+            # Print progress
+            status = "✓ PASS" if result.passed else "✗ FAIL"
+            print(f"{status} {result.test_case_id}: {result.overall_score:.2f}")
+        
+        # Generate summary
+        summary = self._generate_summary(results)
+        
+        # Save results
+        self._save_results(results, summary, output_dir)
+        
+        return summary
+    
+    def _generate_summary(self, results: List[TestCaseResult]) -> Dict[str, Any]:
+        """Generate summary statistics."""
+        total = len(results)
+        passed = sum(1 for r in results if r.passed)
+        
+        avg_score = sum(r.overall_score for r in results) / total if total > 0 else 0.0
+        
+        # Metric breakdowns
+        metric_scores = {}
+        for result in results:
+            for metric in result.metrics:
+                if metric.metric_name not in metric_scores:
+                    metric_scores[metric.metric_name] = []
+                metric_scores[metric.metric_name].append(metric.score)
+        
+        metric_averages = {
+            name: sum(scores) / len(scores) if scores else 0.0
+            for name, scores in metric_scores.items()
+        }
+        
+        return {
+            "timestamp": datetime.now().isoformat(),
+            "total_tests": total,
+            "passed": passed,
+            "failed": total - passed,
+            "pass_rate": passed / total if total > 0 else 0.0,
+            "average_score": avg_score,
+            "metric_averages": metric_averages
+        }
+    
+    def _save_results(
+        self,
+        results: List[TestCaseResult],
+        summary: Dict[str, Any],
+        output_dir: str
+    ):
+        """Save evaluation results to files."""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        
+        # Save detailed results
+        results_file = os.path.join(output_dir, f"eval_results_{timestamp}.json")
+        with open(results_file, 'w') as f:
+            json.dump({
+                "results": [self._result_to_dict(r) for r in results],
+                "summary": summary
+            }, f, indent=2)
+        
+        # Save summary report
+        report_file = os.path.join(output_dir, f"eval_report_{timestamp}.txt")
+        with open(report_file, 'w', encoding='utf-8') as f:
+            f.write(self._generate_text_report(results, summary))
+        
+        print(f"\n✓ Results saved to: {results_file}")
+        print(f"✓ Report saved to: {report_file}")
+    
+    def _result_to_dict(self, result: TestCaseResult) -> Dict[str, Any]:
+        """Convert TestCaseResult to dictionary."""
+        return {
+            "test_case_id": result.test_case_id,
+            "query": result.query,
+            "agent_response": result.agent_response,
+            "overall_score": result.overall_score,
+            "passed": result.passed,
+            "timestamp": result.timestamp,
+            "metrics": [
+                {
+                    "name": m.metric_name,
+                    "type": m.metric_type.value,
+                    "score": m.score,
+                    "passed": m.passed,
+                    "explanation": m.explanation,
+                    "details": m.details
+                }
+                for m in result.metrics
+            ]
+        }
+    
+    def _generate_text_report(
+        self,
+        results: List[TestCaseResult],
+        summary: Dict[str, Any]
+    ) -> str:
+        """Generate human-readable text report."""
+        lines = []
+        lines.append("=" * 80)
+        lines.append("AI AGENT EVALUATION REPORT")
+        lines.append("=" * 80)
+        lines.append(f"\nTimestamp: {summary['timestamp']}")
+        lines.append(f"Total Tests: {summary['total_tests']}")
+        lines.append(f"Passed: {summary['passed']}")
+        lines.append(f"Failed: {summary['failed']}")
+        lines.append(f"Pass Rate: {summary['pass_rate']:.1%}")
+        lines.append(f"Average Score: {summary['average_score']:.2f}")
+        
+        lines.append("\n" + "=" * 80)
+        lines.append("METRIC AVERAGES")
+        lines.append("=" * 80)
+        for metric, avg in summary['metric_averages'].items():
+            lines.append(f"{metric:30s}: {avg:.2f}")
+        
+        lines.append("\n" + "=" * 80)
+        lines.append("DETAILED RESULTS")
+        lines.append("=" * 80)
+        
+        for result in results:
+            status = "✓ PASS" if result.passed else "✗ FAIL"
+            lines.append(f"\n{status} {result.test_case_id} (Score: {result.overall_score:.2f})")
+            lines.append(f"Query: {result.query}")
+            lines.append("\nMetrics:")
+            for metric in result.metrics:
+                lines.append(f"  - {metric.metric_name}: {metric.score:.2f} - {metric.explanation}")
+        
+        return "\n".join(lines)
+
+
+def example_usage():
+    """Example of how to use the evaluation runner."""
+    
+    # This is an example - in practice, you'd capture real agent traces
+    example_traces = [
+        AgentTrace(
+            query="I noticed my last invoice was higher than usual—can you help me understand why and what can be done about it?",
+            response="I've checked your billing history and found that your invoice increased due to a plan upgrade last month. According to our billing policy, you can request a review within 30 days.",
+            tool_calls=[
+                {"name": "get_customer_detail", "args": {"customer_id": 1001}, "result": {}},
+                {"name": "get_billing_summary", "args": {"customer_id": 1001}, "result": {}},
+                {"name": "search_knowledge_base", "args": {"query": "billing policy"}, "result": {}}
+            ],
+            metadata={"agent_type": "single_agent", "duration_ms": 2500}
+        )
+    ]
+    
+    # Run evaluation
+    runner = AgentEvaluationRunner(dataset_path="eval_dataset.json")
+    summary = runner.run_evaluation(example_traces, output_dir="eval_results")
+    
+    print("\n" + "=" * 80)
+    print("EVALUATION SUMMARY")
+    print("=" * 80)
+    print(json.dumps(summary, indent=2))
+
+
+if __name__ == "__main__":
+    example_usage()
diff --git a/agentic_ai/evaluations/metrics.py b/agentic_ai/evaluations/metrics.py
new file mode 100644
index 000000000..2820ed96d
--- /dev/null
+++ b/agentic_ai/evaluations/metrics.py
@@ -0,0 +1,1106 @@
+"""
+Evaluation metrics for AI Agent performance assessment.
+Pattern-agnostic metrics that work across:
+- single agents
+- handoff agents
+- reflection agents
+- research/magentic agents
+
+Includes Azure AI Foundry evaluators for LLM-as-judge evaluation.
+"""
+
+import os
+from typing import Dict, List, Any, Optional
+from dataclasses import dataclass
+from enum import Enum
+
+# Azure AI Foundry Evaluators (optional - graceful degradation if not available)
+try:
+    from azure.ai.evaluation import (
+        IntentResolutionEvaluator,
+        TaskAdherenceEvaluator,
+        ToolCallAccuracyEvaluator,
+        CoherenceEvaluator,
+        FluencyEvaluator,
+        RelevanceEvaluator,
+    )
+    AZURE_EVALUATORS_AVAILABLE = True
+except ImportError:
+    AZURE_EVALUATORS_AVAILABLE = False
+    IntentResolutionEvaluator = None
+    TaskAdherenceEvaluator = None
+    ToolCallAccuracyEvaluator = None
+    CoherenceEvaluator = None
+    FluencyEvaluator = None
+    RelevanceEvaluator = None
+
+
+# =========================
+# Metric Types
+# =========================
+
+class MetricType(Enum):
+    """Category label attached to every EvaluationResult; the value is its string form."""
+    TOOL_BEHAVIOR = "tool_behavior"
+    RESPONSE_QUALITY = "response_quality"
+    ACCURACY = "accuracy"
+    COMPLETENESS = "completeness"
+    EFFICIENCY = "efficiency"
+    SAFETY = "safety"
+    INTENT = "intent"
+    COHERENCE = "coherence"
+    FLUENCY = "fluency"
+    RELEVANCE = "relevance"
+    SOLUTION_ACCURACY = "solution_accuracy"
+    TASK_COMPLETION = "task_completion"
+
+
+# =========================
+# Result Container
+# =========================
+
+@dataclass
+class EvaluationResult:
+    # Outcome of one evaluator for one response.
+    # Name of the specific metric, e.g. "tool_behavior" or "completeness".
+    metric_name: str
+    # Broad category the metric belongs to.
+    metric_type: MetricType
+    score: float  # 1.0 – 5.0 scale (matching Foundry portal)
+    # Evaluator-specific pass/fail verdict (each evaluator applies its own rule).
+    passed: bool
+    # Evaluator-specific supporting data.
+    details: Dict[str, Any]
+    # Short human-readable justification for the score.
+    explanation: str
+
+
+# =========================
+# Tool Behavior Evaluator (Upgraded)
+# =========================
+
+class ToolBehaviorEvaluator:
+    """
+    Pattern-agnostic tool scoring:
+    - recall (required tools used)
+    - precision (relevant vs total)
+    - efficiency (minimal sufficiency)
+    """
+
+    def evaluate(
+        self,
+        expected_tools: List[str],
+        actual_tools: List[str],
+        required_tools: Optional[List[str]] = None,
+    ) -> EvaluationResult:
+
+        required_tools = required_tools or expected_tools
+
+        actual_set = set(actual_tools)
+        expected_set = set(expected_tools)
+        required_set = set(required_tools)
+
+        required_hit = required_set & actual_set
+        missing_required = required_set - actual_set
+        extra_tools = actual_set - expected_set
+        relevant_used = actual_set & expected_set
+
+        # --- Scores ---
+
+        recall = len(required_hit) / len(required_set) if required_set else 1.0
+        precision = len(relevant_used) / len(actual_set) if actual_set else 1.0
+        efficiency = len(required_set) / len(actual_set) if actual_set else 1.0
+        efficiency = min(efficiency, 1.0)
+
+        # Combined ratio (0-1), then scale to 1-5
+        combined = (recall * 0.5) + (precision * 0.3) + (efficiency * 0.2)
+        score = 1.0 + (combined * 4.0)  # Maps 0-1 to 1-5 scale
+
+        passed = recall == 1.0  # All required tools must be used
+
+        details = {
+            "recall": recall,
+            "precision": precision,
+            "efficiency": efficiency,
+            "missing_required": list(missing_required),
+            "extra_tools": list(extra_tools),
+            "required_hit": list(required_hit),
+        }
+
+        explanation = (
+            f"Recall={recall:.2f} Precision={precision:.2f} "
+            f"Efficiency={efficiency:.2f} Score={score:.1f}/5"
+        )
+
+        return EvaluationResult(
+            metric_name="tool_behavior",
+            metric_type=MetricType.TOOL_BEHAVIOR,
+            score=score,
+            passed=passed,
+            details=details,
+            explanation=explanation,
+        )
+
+
+# =========================
+# Completeness Evaluator (Hybrid)
+# =========================
+
+class CompletenessEvaluator:
+    """
+    Deterministic tool checks + optional LLM semantic checks.
+    """
+
+    TOOL_CRITERIA_MAP = {
+        "must_access_billing": ["get_billing_summary", "get_subscription_detail"],
+        "must_check_subscription": ["get_subscription_detail"],
+        "must_check_security_logs": ["get_security_logs"],
+        "must_check_promotions": ["get_eligible_promotions"],
+        "must_check_orders": ["get_customer_orders"],
+    }
+
+    def evaluate(
+        self,
+        success_criteria: Dict[str, bool],
+        tool_calls: List[Dict[str, Any]],
+    ) -> EvaluationResult:
+
+        tool_names = [c.get("name", "") for c in tool_calls]
+        results = {}
+
+        for criterion, required in success_criteria.items():
+
+            if not required:
+                results[criterion] = True
+                continue
+
+            if criterion in self.TOOL_CRITERIA_MAP:
+                needed = self.TOOL_CRITERIA_MAP[criterion]
+                results[criterion] = any(t in tool_names for t in needed)
+            else:
+                # semantic criteria handled by LLM judge metric
+                results[criterion] = True
+
+        required_count = sum(success_criteria.values())
+        met_count = sum(
+            1 for k, v in results.items()
+            if v and success_criteria.get(k)
+        )
+
+        # Scale to 1-5 (0 if no requirements, otherwise proportional)
+        ratio = met_count / required_count if required_count else 1.0
+        score = 1.0 + (ratio * 4.0)  # Maps 0-1 ratio to 1-5 scale
+        passed = met_count == required_count  # All requirements must be met
+
+        return EvaluationResult(
+            metric_name="completeness",
+            metric_type=MetricType.COMPLETENESS,
+            score=score,
+            passed=passed,
+            details=results,
+            explanation=f"{met_count}/{required_count} required criteria met",
+        )
+
+
+# =========================
+# Efficiency Evaluator (NEW)
+# =========================
+
+class EfficiencyEvaluator:
+    """
+    Pattern-agnostic step efficiency metric.
+    """
+
+    def evaluate(
+        self,
+        actual_tool_calls: int,
+        required_tools: int,
+    ) -> EvaluationResult:
+
+        baseline = max(required_tools, 1)
+        efficiency = baseline / max(actual_tool_calls, 1)
+        efficiency = min(efficiency, 1.0)
+        
+        # Scale to 1-5
+        score = 1.0 + (efficiency * 4.0)
+
+        return EvaluationResult(
+            metric_name="step_efficiency",
+            metric_type=MetricType.EFFICIENCY,
+            score=score,
+            passed=score >= 3.0,  # Threshold: 3/5
+            details={
+                "actual_calls": actual_tool_calls,
+                "baseline_required": baseline,
+            },
+            explanation=f"Efficiency {score:.1f}/5",
+        )
+
+
+# =========================
+# LLM Judge Evaluator (Upgraded)
+# =========================
+
+class ResponseQualityEvaluator:
+    """LLM-as-judge response quality (1-5 scale) with a length-based fallback."""
+
+    def __init__(self, llm_client=None):
+        # Used via ``chat.completions.create``; None means fallback scoring only.
+        self.client = llm_client
+
+    def evaluate(
+        self,
+        query: str,
+        response: str,
+        tool_summary: Optional[str] = None,
+    ) -> EvaluationResult:
+        """Judge ``response`` to ``query``; uses ``_basic`` without a client or on error."""
+        if not self.client:
+            return self._basic(response)
+        # Do not show the judge a literal "None" when no tool evidence was captured.
+        evidence = tool_summary if tool_summary else "(none provided)"
+        prompt = f"""
+Evaluate this customer support response.
+
+Query: {query}
+Response: {response}
+Tool Evidence: {evidence}
+
+Score 0–10:
+- relevance
+- clarity
+- completeness
+- professionalism
+- actionability
+- groundedness (uses evidence, not guesses)
+- safety (no over-promising)
+
+Return JSON with overall_score and explanation.
+"""
+        try:
+            r = self.client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": "Expert evaluator."},
+                    {"role": "user", "content": prompt},
+                ],
+                response_format={"type": "json_object"},
+            )
+            import json
+            data = json.loads(r.choices[0].message.content)
+
+            # Judge output may be a numeric string or fall outside 0-10: coerce, then clamp.
+            raw_score = min(max(float(data["overall_score"]), 0.0), 10.0)
+            score = 1.0 + (raw_score / 10.0) * 4.0  # Maps 0-10 to 1-5
+            return EvaluationResult(
+                metric_name="response_quality",
+                metric_type=MetricType.RESPONSE_QUALITY,
+                score=score,
+                passed=score >= 3.0,  # Threshold: 3/5
+                details=data,
+                explanation=data.get("explanation", ""),
+            )
+        except Exception:  # best-effort: any judge/parse failure uses the length check
+            return self._basic(response)
+
+    def _basic(self, response: str) -> EvaluationResult:
+        """Heuristic fallback: more than 15 whitespace-separated words scores 5/5."""
+        ok = len(response.split()) > 15
+        score = 5.0 if ok else 1.0  # 5/5 for good, 1/5 for bad
+        return EvaluationResult(
+            metric_name="response_quality_basic",
+            metric_type=MetricType.RESPONSE_QUALITY,
+            score=score,
+            passed=ok,
+            details={},
+            explanation="Basic length check",
+        )
+
+
+# =========================
+# Grounded Accuracy Evaluator (NEW)
+# =========================
+
+class GroundedAccuracyEvaluator:
+    """Checks if response contradicts tool outputs (LLM-assisted)."""
+
+    def __init__(self, llm_client=None):
+        self.client = llm_client
+
+    def evaluate(
+        self,
+        response: str,
+        tool_outputs: Optional[str],
+    ) -> EvaluationResult:
+        """Return 5/5 when grounded, 1/5 on contradiction, 5/5 when no check ran.
+
+        Args:
+            response: Agent response to verify.
+            tool_outputs: Tool facts to verify against; falsy skips the check.
+        """
+        if not self.client or not tool_outputs:
+            return self._default_pass("No grounding check available")
+
+        prompt = f"""
+Tool facts:
+{tool_outputs}
+
+Response:
+{response}
+
+Does the response contradict the tool facts?
+Answer JSON: {{ "contradiction": true/false }}
+"""
+        try:
+            r = self.client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[{"role": "user", "content": prompt}],
+                response_format={"type": "json_object"},
+            )
+            import json
+            data = json.loads(r.choices[0].message.content)
+            flag = data.get("contradiction", False)
+            # A JSON string such as "false" is truthy in Python; read string
+            # answers by their text so "false" is not reported as a contradiction.
+            if isinstance(flag, str):
+                contradiction = flag.strip().lower() in ("true", "yes", "1")
+            else:
+                contradiction = bool(flag)
+
+            return EvaluationResult(
+                metric_name="grounded_accuracy",
+                metric_type=MetricType.ACCURACY,
+                score=1.0 if contradiction else 5.0,  # 1-5 scale
+                passed=not contradiction,
+                details=data,
+                explanation="Contradiction detected" if contradiction else "Grounded",
+            )
+        except Exception:  # best-effort: a failed judge call falls back to a pass
+            return self._default_pass("Grounding check failed → default pass")
+
+    def _default_pass(self, explanation: str) -> EvaluationResult:
+        """Build the 5/5 passing result used when no verdict could be obtained."""
+        return EvaluationResult(
+            metric_name="grounded_accuracy",
+            metric_type=MetricType.ACCURACY,
+            score=5.0,  # Default pass on 1-5 scale
+            passed=True,
+            details={},
+            explanation=explanation,
+        )
+
+
+# =========================
+# Safety / Overreach Evaluator (NEW)
+# =========================
+
+class SafetyEvaluator:
+    """Flags responses that contain known over-promising phrases."""
+
+    RISKY_PATTERNS = [
+        "guarantee refund",
+        "will definitely refund",
+        "account unlocked now",
+        "I have removed the charge",
+    ]
+
+    def evaluate(self, response: str) -> EvaluationResult:
+        """Return 5/5 when no risky phrase occurs in ``response``, else 1/5."""
+        lower = response.lower()
+        # Lower-case each pattern too: the text is lower-cased, so a pattern with
+        # capitals ("I have removed the charge") could otherwise never match.
+        hits = [p for p in self.RISKY_PATTERNS if p.lower() in lower]
+        safe = not hits
+        return EvaluationResult(
+            metric_name="safety",
+            metric_type=MetricType.SAFETY,
+            score=5.0 if safe else 1.0,  # 5/5 for safe, 1/5 for risky
+            passed=safe,
+            details={"matches": hits},
+            explanation="No overreach" if safe else "Potential overreach detected",
+        )
+
+
+# =========================
+# Azure AI Foundry Evaluators (LLM-as-Judge)
+# =========================
+
+def _safe_float(value: Any, default: float = 0.0) -> float:
+    """Coerce an int, float or numeric string to float; otherwise ``default``."""
+    # bool is an int subclass, so True/False convert to 1.0/0.0 here.
+    if isinstance(value, (int, float)):
+        return float(value)
+    if not isinstance(value, str):
+        return default
+    try:
+        parsed = float(value)
+    except ValueError:
+        return default
+    return parsed
+
+
+class AzureAIEvaluatorSuite:
+    """
+    Wrapper for Azure AI Foundry evaluation SDK evaluators.
+    
+    Provides LLM-as-judge evaluation for:
+    - Intent Resolution: Did agent correctly identify user intent?
+    - Task Adherence: Did response follow the task/system prompt?
+    - Tool Call Accuracy: Did agent use tools correctly with right parameters?
+    - Coherence: Is response logically coherent?
+    - Fluency: Is response well-written?
+    - Relevance: Is response relevant to query?
+    - Solution Accuracy: Does response match ground truth?
+    """
+
+    # Tool schemas (name, description, JSON-schema parameters) for the Contoso MCP server.
+    # evaluate_tool_call_accuracy() passes these to ToolCallAccuracyEvaluator when no tool_definitions are given.
+    CONTOSO_TOOL_DEFINITIONS = [
+        {
+            "name": "get_all_customers",
+            "description": "List all customers with basic info",
+            "parameters": {"type": "object", "properties": {}}
+        },
+        {
+            "name": "get_customer_detail",
+            "description": "Get a full customer profile including their subscriptions",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "customer_id": {"type": "integer", "description": "Customer identifier value"}
+                },
+                "required": ["customer_id"]
+            }
+        },
+        {
+            "name": "get_subscription_detail",
+            "description": "Detailed subscription view with invoices (with payments) and service incidents",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "subscription_id": {"type": "integer", "description": "Subscription identifier value"}
+                },
+                "required": ["subscription_id"]
+            }
+        },
+        {
+            "name": "get_invoice_payments",
+            "description": "Return invoice-level payments list",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "invoice_id": {"type": "integer", "description": "Invoice identifier value"}
+                },
+                "required": ["invoice_id"]
+            }
+        },
+        {
+            "name": "pay_invoice",
+            "description": "Record a payment for a given invoice and get new outstanding balance",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "invoice_id": {"type": "integer", "description": "Invoice identifier value"},
+                    "amount": {"type": "number", "description": "Payment amount"},
+                    "method": {"type": "string", "description": "Payment method"}
+                },
+                "required": ["invoice_id", "amount"]
+            }
+        },
+        {
+            "name": "get_data_usage",
+            "description": "Daily data-usage records for a subscription over a date range",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "subscription_id": {"type": "integer", "description": "Subscription identifier value"},
+                    "start_date": {"type": "string", "description": "Inclusive start date (YYYY-MM-DD)"},
+                    "end_date": {"type": "string", "description": "Inclusive end date (YYYY-MM-DD)"},
+                    "aggregate": {"type": "boolean", "description": "Set to true for aggregate statistics"}
+                },
+                "required": ["subscription_id", "start_date", "end_date"]
+            }
+        },
+        {
+            "name": "get_billing_summary",
+            "description": "Billing summary for a customer including outstanding balance and payment history",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "customer_id": {"type": "integer", "description": "Customer identifier value"}
+                },
+                "required": ["customer_id"]
+            }
+        },
+        {
+            "name": "get_security_logs",
+            "description": "Security events for a customer (newest first)",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "customer_id": {"type": "integer", "description": "Customer identifier value"}
+                },
+                "required": ["customer_id"]
+            }
+        },
+        {
+            "name": "unlock_account",
+            "description": "Unlock a locked customer account after security verification",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "customer_id": {"type": "integer", "description": "Customer identifier value"}
+                },
+                "required": ["customer_id"]
+            }
+        },
+        {
+            "name": "get_products",
+            "description": "List / search available products",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "category": {"type": "string", "description": "Filter by category"}
+                }
+            }
+        },
+        {
+            "name": "get_product_detail",
+            "description": "Return a single product by ID",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "product_id": {"type": "integer", "description": "Product identifier value"}
+                },
+                "required": ["product_id"]
+            }
+        },
+        {
+            "name": "get_promotions",
+            "description": "List every active promotion",
+            "parameters": {"type": "object", "properties": {}}
+        },
+        {
+            "name": "get_eligible_promotions",
+            "description": "Promotions eligible for a given customer right now (evaluates basic loyalty/date criteria)",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "customer_id": {"type": "integer", "description": "Customer identifier value"}
+                },
+                "required": ["customer_id"]
+            }
+        },
+        {
+            "name": "get_support_tickets",
+            "description": "Retrieve support tickets for a customer",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "customer_id": {"type": "integer", "description": "Customer identifier value"},
+                    "open_only": {"type": "boolean", "description": "Filter to open tickets only"}
+                },
+                "required": ["customer_id"]
+            }
+        },
+        {
+            "name": "create_support_ticket",
+            "description": "Create a new support ticket for a customer",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "customer_id": {"type": "integer", "description": "Customer identifier value"},
+                    "subject": {"type": "string", "description": "Ticket subject"},
+                    "description": {"type": "string", "description": "Detailed description of the issue"},
+                    "priority": {"type": "string", "description": "Priority level (low, medium, high)"}
+                },
+                "required": ["customer_id", "subject", "description"]
+            }
+        },
+        {
+            "name": "get_customer_orders",
+            "description": "Get orders for a customer",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "customer_id": {"type": "integer", "description": "Customer identifier value"}
+                },
+                "required": ["customer_id"]
+            }
+        },
+        {
+            "name": "search_knowledge_base",
+            "description": "Search the knowledge base for relevant articles",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "query": {"type": "string", "description": "Search query"}
+                },
+                "required": ["query"]
+            }
+        }
+    ]
+
+    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize Azure AI evaluators.
+
+        Args:
+            model_config: Azure OpenAI configuration dict (built from AZURE_OPENAI_* env vars when omitted) with:
+                - azure_endpoint: Azure OpenAI endpoint URL
+                - api_key: API key (optional if using DefaultAzureCredential)
+                - azure_deployment: Model deployment name
+                - api_version: API version
+        """
+        self.available = AZURE_EVALUATORS_AVAILABLE
+        self._evaluators_initialized = False
+        # Define every evaluator slot before any early return below, so methods
+        # that read them on a disabled suite see None instead of AttributeError.
+        self._intent_evaluator = self._coherence_evaluator = None
+        self._fluency_evaluator = self._relevance_evaluator = None
+        self._tool_call_accuracy_evaluator = self._task_adherence_evaluator = None
+        if not self.available:
+            print("[WARN] Azure AI Evaluation SDK not available - using fallback metrics")
+            return
+        # Build model config from environment if not provided
+        if model_config is None:
+            # Use separate eval deployment if configured (for model compatibility)
+            eval_deployment = os.getenv("AZURE_OPENAI_EVAL_DEPLOYMENT") or os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-4o-mini")
+            model_config = {
+                "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
+                "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
+                "azure_deployment": eval_deployment,
+                "api_version": os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview"),
+            }
+        if not model_config.get("azure_endpoint"):
+            print("[WARN] AZURE_OPENAI_ENDPOINT not set - Azure evaluators disabled")
+            self.available = False
+            return
+        try:
+            self._intent_evaluator = IntentResolutionEvaluator(model_config=model_config)
+            self._coherence_evaluator = CoherenceEvaluator(model_config=model_config)
+            self._fluency_evaluator = FluencyEvaluator(model_config=model_config)
+            self._relevance_evaluator = RelevanceEvaluator(model_config=model_config)
+            self._tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
+            self._task_adherence_evaluator = TaskAdherenceEvaluator(model_config=model_config)
+            self._evaluators_initialized = True
+            print("[OK] Initialized Azure AI Foundry evaluators (including ToolCallAccuracyEvaluator, TaskAdherenceEvaluator)")
+        except Exception as e:
+            print(f"[WARN] Failed to initialize Azure evaluators: {e}")
+            self.available = False
+
+    def evaluate_intent(self, query: str, response: str) -> EvaluationResult:
+        """Rate (1-5) whether the agent correctly identified the user's intent; 3+ passes."""
+        suite_ready = self.available and self._evaluators_initialized
+        if not suite_ready:
+            return self._fallback_result("intent_resolution", MetricType.INTENT)
+        try:
+            sdk_output = self._intent_evaluator(query=query, response=response)
+            rating = _safe_float(sdk_output.get("intent_resolution", 0))  # Keep 1-5 scale
+            return EvaluationResult(
+                metric_name="intent_resolution",
+                metric_type=MetricType.INTENT,
+                score=rating,
+                passed=rating >= 3.0,  # Threshold: 3/5
+                details=sdk_output,
+                explanation=sdk_output.get("intent_resolution_reason", ""),
+            )
+        except Exception as exc:  # evaluator errors degrade to the neutral fallback
+            return self._fallback_result("intent_resolution", MetricType.INTENT, str(exc))
+
+    def evaluate_coherence(self, query: str, response: str) -> EvaluationResult:
+        """Rate (1-5) the logical coherence of the response; 3+ passes."""
+        suite_ready = self.available and self._evaluators_initialized
+        if not suite_ready:
+            return self._fallback_result("coherence", MetricType.COHERENCE)
+        try:
+            sdk_output = self._coherence_evaluator(query=query, response=response)
+            rating = _safe_float(sdk_output.get("coherence", 0))  # Keep 1-5 scale
+            return EvaluationResult(
+                metric_name="coherence",
+                metric_type=MetricType.COHERENCE,
+                score=rating,
+                passed=rating >= 3.0,  # Threshold: 3/5
+                details=sdk_output,
+                explanation=sdk_output.get("coherence_reason", ""),
+            )
+        except Exception as exc:  # evaluator errors degrade to the neutral fallback
+            return self._fallback_result("coherence", MetricType.COHERENCE, str(exc))
+
+    def evaluate_fluency(self, query: str, response: str) -> EvaluationResult:
+        """Rate (1-5) the writing quality of the response; 3+ passes."""
+        suite_ready = self.available and self._evaluators_initialized
+        if not suite_ready:
+            return self._fallback_result("fluency", MetricType.FLUENCY)
+        try:
+            sdk_output = self._fluency_evaluator(query=query, response=response)
+            rating = _safe_float(sdk_output.get("fluency", 0))  # Keep 1-5 scale
+            return EvaluationResult(
+                metric_name="fluency",
+                metric_type=MetricType.FLUENCY,
+                score=rating,
+                passed=rating >= 3.0,  # Threshold: 3/5
+                details=sdk_output,
+                explanation=sdk_output.get("fluency_reason", ""),
+            )
+        except Exception as exc:  # evaluator errors degrade to the neutral fallback
+            return self._fallback_result("fluency", MetricType.FLUENCY, str(exc))
+
+    def evaluate_relevance(self, query: str, response: str) -> EvaluationResult:
+        """Rate (1-5) how relevant the response is to the query; 3+ passes."""
+        suite_ready = self.available and self._evaluators_initialized
+        if not suite_ready:
+            return self._fallback_result("relevance", MetricType.RELEVANCE)
+        try:
+            sdk_output = self._relevance_evaluator(query=query, response=response)
+            rating = _safe_float(sdk_output.get("relevance", 0))  # Keep 1-5 scale
+            return EvaluationResult(
+                metric_name="relevance",
+                metric_type=MetricType.RELEVANCE,
+                score=rating,
+                passed=rating >= 3.0,  # Threshold: 3/5
+                details=sdk_output,
+                explanation=sdk_output.get("relevance_reason", ""),
+            )
+        except Exception as exc:  # evaluator errors degrade to the neutral fallback
+            return self._fallback_result("relevance", MetricType.RELEVANCE, str(exc))
+
+    def evaluate_tool_call_accuracy(
+        self,
+        query: str,
+        response: str,
+        tool_calls: List[Dict[str, Any]],
+        tool_definitions: Optional[List[Dict[str, Any]]] = None,
+    ) -> EvaluationResult:
+        """
+        Judge the agent's tool usage with the Azure AI ToolCallAccuracyEvaluator.
+
+        Each call is normalised to ``{"type": "tool_call", "tool_call_id", "name",
+        "arguments"}`` and sent to the evaluator together with the tool schemas.
+
+        Scoring rubric (1-5), 3 or higher passes:
+        - 5: Tool calls relevant, all parameters correctly passed
+        - 4: Relevant, but retried on errors and succeeded
+        - 3: Relevant, but unnecessary/excessive tool calls made
+        - 2: Partially relevant, not enough tools or incorrect params
+        - 1: Tool calls are irrelevant
+
+        Args:
+            query: User query
+            response: Agent response
+            tool_calls: Calls made by the agent. The name is read from ``name`` or
+                ``function.name``; parameters from ``args``, ``arguments`` or
+                ``function.arguments`` (a JSON string is decoded, ``{}`` if invalid).
+            tool_definitions: Tool schemas (defaults to CONTOSO_TOOL_DEFINITIONS)
+
+        Returns:
+            The evaluator's score; a neutral passing 3.0 when the suite is
+            unavailable, no calls were made, or the evaluator raised.
+        """
+        suite_ready = self.available and self._evaluators_initialized
+        if not suite_ready:
+            return self._fallback_result("tool_call_accuracy", MetricType.TOOL_BEHAVIOR)
+
+        if not tool_calls:
+            # Nothing to grade: report the neutral mid-point as a pass.
+            return EvaluationResult(
+                metric_name="tool_call_accuracy",
+                metric_type=MetricType.TOOL_BEHAVIOR,
+                score=3.0,  # Neutral on 1-5 scale
+                passed=True,
+                details={"reason": "No tool calls made"},
+                explanation="No tool calls to evaluate",
+            )
+
+        # Fall back to the built-in Contoso schemas when the caller gave none.
+        schemas = self.CONTOSO_TOOL_DEFINITIONS if tool_definitions is None else tool_definitions
+
+        try:
+            import json
+
+            normalised = []
+            for index, call in enumerate(tool_calls):
+                fn_block = call.get("function", {})
+                name = call.get("name", fn_block.get("name", "unknown"))
+                params = call.get("args", call.get("arguments", fn_block.get("arguments", {})))
+
+                # Arguments may arrive JSON-encoded; undecodable text becomes {}.
+                if isinstance(params, str):
+                    try:
+                        params = json.loads(params)
+                    except json.JSONDecodeError:
+                        params = {}
+
+                normalised.append({
+                    "type": "tool_call",
+                    "tool_call_id": call.get("id", f"call_{index}"),
+                    "name": name,
+                    "arguments": params,
+                })
+
+            verdict = self._tool_call_accuracy_evaluator(
+                query=query,
+                response=response,
+                tool_calls=normalised,
+                tool_definitions=schemas,
+            )
+
+            # 1-5 scale kept as-is for portal parity; a missing score reads as 3.
+            rating = _safe_float(verdict.get("tool_call_accuracy", 3))
+
+            report = {
+                "tool_call_accuracy_details": verdict.get("tool_call_accuracy_details", {}),
+                "tool_calls_evaluated": len(normalised),
+                **verdict,  # evaluator keys win over the two above on a clash
+            }
+
+            return EvaluationResult(
+                metric_name="tool_call_accuracy",
+                metric_type=MetricType.TOOL_BEHAVIOR,
+                score=rating,
+                passed=rating >= 3.0,  # Threshold: 3/5
+                details=report,
+                explanation=verdict.get("tool_call_accuracy_reason", f"Score: {rating}/5"),
+            )
+        except Exception as exc:
+            return self._fallback_result("tool_call_accuracy", MetricType.TOOL_BEHAVIOR, str(exc))
+
+    def evaluate_task_adherence(
+        self,
+        query: str,
+        response: str,
+        tool_calls: Optional[List[Dict[str, Any]]] = None,
+        task_description: Optional[str] = None,
+    ) -> EvaluationResult:
+        """
+        Evaluate whether the agent adheres to the assigned task and follows expected procedures.
+
+        TaskAdherenceEvaluator checks:
+        - Did the agent address the user's goal?
+        - Did it follow proper procedures/steps?
+        - Did it avoid going off-topic or performing unauthorized actions?
+
+        Complements solution_accuracy, which compares the response to ground truth.
+        A boolean verdict maps to 5.0/0.0; a numeric one is kept; 3.0 or more passes.
+        When the suite is unavailable or the call fails, a neutral fallback is returned.
+
+        Args:
+            query: User query
+            response: Agent response
+            tool_calls: List of tool calls made (to show what actions were taken)
+            task_description: Optional task description (defaults to Contoso agent role)
+        """
+        # Guard on the readiness flags (as the sibling evaluators do) rather than on the
+        # evaluator attribute, which is never assigned when __init__ returns early.
+        if not self.available or not self._evaluators_initialized:
+            return self._fallback_result("task_adherence", MetricType.TASK_COMPLETION)
+
+        # Default task description for Contoso customer service
+        if task_description is None:
+            task_description = """You are a customer service agent for Contoso Telecom.
+Your task is to help customers with:
+- Billing inquiries and payment processing
+- Subscription management and data usage
+- Technical support and troubleshooting
+- Account security and fraud detection
+- Product and promotion information
+
+You must:
+- Only access customer data when the customer provides their customer ID
+- Provide accurate information based on the customer's actual account data
+- Never make up or hallucinate information
+- Follow proper procedures for sensitive operations like payments
+- Be helpful, professional, and empathetic"""
+
+        try:
+            import json
+
+            # Format the conversation as agent messages
+            # TaskAdherenceEvaluator expects a conversation-style format
+            query_messages = [{"role": "user", "content": query}]
+            # Build response messages including tool calls if any
+            response_messages = []
+            # If tool calls were made, include them in the assistant's actions
+            if tool_calls:
+                for tc in tool_calls:
+                    tool_name = tc.get("name", tc.get("function", {}).get("name", "unknown"))
+                    tool_args = tc.get("args", tc.get("arguments", tc.get("function", {}).get("arguments", {})))
+
+                    if isinstance(tool_args, str):
+                        try:
+                            tool_args = json.loads(tool_args)
+                        except json.JSONDecodeError:
+                            tool_args = {}
+
+                    response_messages.append({
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [{
+                            "id": tc.get("id", f"call_{tool_name}"),
+                            "type": "function",
+                            "function": {
+                                "name": tool_name,
+                                "arguments": json.dumps(tool_args) if isinstance(tool_args, dict) else str(tool_args),
+                            }
+                        }]
+                    })
+
+            # Add final response
+            response_messages.append({"role": "assistant", "content": response})
+
+            # Call the evaluator
+            result = self._task_adherence_evaluator(
+                query=query_messages,
+                response=response_messages,
+                task=task_description,
+            )
+
+            # TaskAdherenceEvaluator returns a numeric score
+            # Keep 1-5 scale for portal parity (0 for failures)
+            raw_score = result.get("task_adherence", 0)
+
+            # Handle boolean or numeric
+            if isinstance(raw_score, bool):
+                score = 5.0 if raw_score else 0.0
+            else:
+                score = _safe_float(raw_score)
+
+            # Threshold: score >= 3 is passing
+            passed = score >= 3.0
+
+            return EvaluationResult(
+                metric_name="task_adherence",
+                metric_type=MetricType.TASK_COMPLETION,
+                score=score,
+                passed=passed,
+                details={
+                    "raw_result": result,
+                    "tool_calls_count": len(tool_calls) if tool_calls else 0,
+                    "task_description_length": len(task_description),
+                },
+                explanation=result.get("task_adherence_reason", f"Task adherence: {score}/5"),
+            )
+        except Exception as e:
+            return self._fallback_result("task_adherence", MetricType.TASK_COMPLETION, str(e))
+
+    def evaluate_solution_accuracy(
+        self,
+        query: str,
+        response: str,
+        ground_truth: str,
+        scoring_rubric: str,
+        llm_client=None,
+    ) -> EvaluationResult:
+        """
+        Grade the response against the expected solution with an LLM judge.
+
+        The judge is given the query, response, ground truth and rubric and asked
+        for JSON ``{"score": 1-5, "reason": ...}``; a score of 3.0 or more passes.
+        Without ``llm_client`` an AzureOpenAI client is built from environment variables.
+        """
+        if not (llm_client or self.available):
+            return self._fallback_result("solution_accuracy", MetricType.SOLUTION_ACCURACY)
+
+        judge_prompt = f"""You are evaluating a customer service agent's response.
+
+USER QUERY:
+{query}
+
+AGENT RESPONSE:
+{response}
+
+EXPECTED SOLUTION (Ground Truth):
+{ground_truth}
+
+SCORING RUBRIC:
+{scoring_rubric}
+
+Based on the rubric, score the agent's response from 1-5.
+Return JSON: {{"score": <1-5>, "reason": "<brief explanation>"}}
+"""
+
+        try:
+            client = llm_client
+            if not client:
+                # No client supplied: build one from the Azure OpenAI environment settings.
+                from openai import AzureOpenAI
+                client = AzureOpenAI(
+                    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+                    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
+                    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview"),
+                )
+
+            completion = client.chat.completions.create(
+                model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-4o-mini"),
+                messages=[{"role": "user", "content": judge_prompt}],
+                response_format={"type": "json_object"},
+            )
+
+            import json
+            verdict = json.loads(completion.choices[0].message.content)
+            rating = _safe_float(verdict.get("score", 3))  # Keep 1-5 scale
+
+            return EvaluationResult(
+                metric_name="solution_accuracy",
+                metric_type=MetricType.SOLUTION_ACCURACY,
+                score=rating,
+                passed=rating >= 3.0,  # Threshold: 3/5
+                details=verdict,
+                explanation=verdict.get("reason", ""),
+            )
+        except Exception as exc:
+            return self._fallback_result("solution_accuracy", MetricType.SOLUTION_ACCURACY, str(exc))
+
+    def evaluate_all(
+        self,
+        query: str,
+        response: str,
+        ground_truth: Optional[str] = None,
+        scoring_rubric: Optional[str] = None,
+        tool_calls: Optional[List[Dict[str, Any]]] = None,
+        llm_client=None,
+    ) -> List[EvaluationResult]:
+        """Run the applicable evaluators and collect their results in order.
+
+        Intent, coherence, fluency and relevance always run. Tool-call accuracy
+        and task adherence run only when ``tool_calls`` is non-empty; solution
+        accuracy only when both ``ground_truth`` and ``scoring_rubric`` are given.
+
+        Args:
+            query: User query
+            response: Agent response
+            ground_truth: Expected solution (optional)
+            scoring_rubric: Rubric for scoring (optional)
+            tool_calls: List of tool calls made by agent (optional)
+            llm_client: OpenAI client for solution accuracy (optional)
+        """
+        collected: List[EvaluationResult] = []
+        always_on = (
+            self.evaluate_intent,
+            self.evaluate_coherence,
+            self.evaluate_fluency,
+            self.evaluate_relevance,
+        )
+        for run_metric in always_on:
+            collected.append(run_metric(query, response))
+
+        if tool_calls:
+            collected.append(self.evaluate_tool_call_accuracy(query, response, tool_calls))
+            collected.append(self.evaluate_task_adherence(query, response, tool_calls))
+
+        if ground_truth and scoring_rubric:
+            accuracy = self.evaluate_solution_accuracy(
+                query, response, ground_truth, scoring_rubric, llm_client
+            )
+            collected.append(accuracy)
+
+        return collected
+
+    def _fallback_result(
+        self,
+        metric_name: str,
+        metric_type: MetricType,
+        error: str = "Evaluator not available",
+    ) -> EvaluationResult:
+        """Return a neutral (3.0/5, passing) placeholder that records ``error`` in its details."""
+        neutral_fields = {
+            "metric_name": metric_name,
+            "metric_type": metric_type,
+            "score": 3.0, "passed": True,  # Neutral score on 1-5 scale, reported as a pass
+            "details": {"error": error},
+            "explanation": f"Fallback: {error}",
+        }
+        return EvaluationResult(**neutral_fields)
diff --git a/agentic_ai/evaluations/run_agent_eval.py b/agentic_ai/evaluations/run_agent_eval.py
new file mode 100644
index 000000000..232da713e
--- /dev/null
+++ b/agentic_ai/evaluations/run_agent_eval.py
@@ -0,0 +1,952 @@
+"""
+Run evaluation against the agent served by the running backend.
+
+This script:
+1. Loads settings from the .env file in agentic_ai/applications/
+2. Sends test-case queries to the backend over HTTP (see --backend-url)
+3. Runs all test cases from eval_dataset.json
+4. Captures traces and evaluates performance
+
+Usage:
+    cd agentic_ai/applications
+    uv run python ../evaluations/run_agent_eval.py
+
+Prerequisites:
+    - Backend and MCP server must be running (MCP: cd mcp && uv run python mcp_service.py)
+    - .env file must be configured in agentic_ai/applications/
+"""
+
+import os
+import sys
+import asyncio
+import json
+import warnings
+import logging
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List
+
+# Suppress async generator cleanup warnings from MCP client
+warnings.filterwarnings("ignore", message=".*async_generator.*")
+warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*cancel scope.*")
+
+# Add parent directory to Python path so we can import agents module
+current_dir = Path(__file__).parent
+parent_dir = current_dir.parent
+sys.path.insert(0, str(parent_dir))
+
+# Debug: Print the path that was added
+print(f"🔍 Added to Python path: {parent_dir}")
+print(f"🔍 Agents directory exists: {(parent_dir / 'agents').exists()}")
+
+# Note: No telemetry setup needed - using HTTP requests to backend with telemetry
+
+# Suppress asyncio error logs about async generator cleanup
+logging.getLogger('asyncio').setLevel(logging.CRITICAL)
+
+# Add project paths (resolved root plus applications/) for the package imports below
+project_root = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(project_root))
+sys.path.insert(0, str(project_root / "applications"))
+
+# Load environment from applications/.env
+env_path = project_root / "applications" / ".env"
+try:
+    from dotenv import load_dotenv
+    load_dotenv(env_path)
+except ImportError:
+    # python-dotenv is not installed: minimal KEY=VALUE parser. setdefault mirrors
+    # load_dotenv's default (override=False), so variables that are already set win.
+    if env_path.exists():
+        with open(env_path, encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith('#') and '=' in line:
+                    key, value = line.split('=', 1)
+                    os.environ.setdefault(key.strip(), value.strip().strip('"').strip("'"))
+
+print("=" * 80)
+print("AI AGENT EVALUATION - Using Agent from .env")
+print("=" * 80)
+
+# Import evaluation framework
+from evaluations import AgentEvaluationRunner, AgentTrace
+
+# Import utilities
+from applications.utils import get_state_store
+
+
+class ToolCallTracker:
+    """Stand-in WebSocket manager that records which tools an agent invoked.
+
+    Captures tool calls that an agent emits through its WebSocket-style
+    ``broadcast`` hook: only messages whose ``type`` is ``"tool_called"`` are kept,
+    each stored as ``{"name": <tool_name>}`` in ``tool_calls`` so the evaluator can
+    score tool usage. The same lightweight approach is used in run_batch_eval.py.
+    """
+
+    def __init__(self) -> None:
+        self.tool_calls: List[Dict[str, Any]] = []
+
+    async def broadcast(self, session_id: str, message: dict) -> None:
+        # Non-dict payloads and every other event type are ignored.
+        if not isinstance(message, dict) or message.get("type") != "tool_called":
+            return
+        if name := message.get("tool_name"):  # skip events without a tool name
+            self.tool_calls.append({"name": name})
+
+
+async def run_agent_on_query(agent_instance, query: str, session_id: str) -> tuple[str, List[Dict[str, Any]]]:
+    """Send one query to an in-process agent and return ``(response_text, tool_calls)``.
+
+    Agents exposing ``set_websocket_manager`` receive a ToolCallTracker so that
+    their ``tool_called`` broadcasts are recorded. The agent is then driven via
+    ``chat_async``, else ``chat_stream``, else by awaiting the instance itself.
+    Any exception raised while running is printed and reported as the tuple
+    ``("Error: <message>", [])``. ``session_id`` is accepted but not used here.
+    """
+    tools: List[Dict[str, Any]] = []
+
+    # Hook the tracker in before the run starts
+    tracker: ToolCallTracker | None = None
+    if hasattr(agent_instance, "set_websocket_manager"):
+        tracker = ToolCallTracker()
+        agent_instance.set_websocket_manager(tracker)
+
+    try:
+        # Run agent using the same methods as backend.py
+        if hasattr(agent_instance, "chat_async"):
+            # Agent Framework agents
+            reply = await agent_instance.chat_async(query)
+            text = str(reply) if reply else "No response"
+
+        elif hasattr(agent_instance, "chat_stream"):
+            # Autogen streaming agents - keep only events that carry content
+            chunks = [
+                str(event.content)
+                async for event in agent_instance.chat_stream(query)
+                if hasattr(event, 'content')
+            ]
+            text = " ".join(chunks) if chunks else "No response"
+
+        else:
+            # Fallback: await the agent object itself
+            reply = await agent_instance(query)
+            text = str(reply) if reply else "No response"
+
+        # Tracker output wins; otherwise probe the agent for its own record
+        if tracker is not None and tracker.tool_calls:
+            tools = tracker.tool_calls
+        elif hasattr(agent_instance, 'get_tool_calls'):
+            tools = agent_instance.get_tool_calls()
+        elif hasattr(agent_instance, '_tool_calls'):
+            tools = agent_instance._tool_calls  # type: ignore[attr-defined]
+        elif hasattr(agent_instance, 'tool_calls'):
+            tools = agent_instance.tool_calls  # type: ignore[attr-defined]
+
+    except Exception as e:
+        print(f"  ⚠ Error running agent: {e}")
+        return f"Error: {str(e)}", []
+
+    return text, tools
+
+
+def format_trace_as_agent_messages(trace: AgentTrace) -> tuple[list, list]:
+    """Convert an AgentTrace to the agent message schema expected by Foundry evaluators.
+
+    Ids are derived with CRC32 instead of ``hash()`` (which is salted per process), so a
+    given trace maps to the same ids on every run. Timestamps come from an aware UTC
+    clock, since ``datetime.utcnow()`` is deprecated, but keep the same text format.
+
+    Args:
+        trace: Trace providing ``query``, ``response`` and ``tool_calls``. Each tool call
+            is a dict; its ``name``, ``args`` and ``result`` keys are read, all optional.
+
+    Returns:
+        tuple: (query_messages, response_messages) in OpenAI-style agent message format
+    """
+    import zlib
+    from datetime import datetime, timezone
+
+    def _timestamp() -> str:
+        # Same "<naive ISO-8601>Z" text that utcnow().isoformat() + "Z" produced
+        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
+
+    def _stable_id(text: object) -> str:
+        # Five-digit id that is identical across interpreter runs
+        return f"{zlib.crc32(str(text).encode('utf-8')) % 100000:05d}"
+
+    # Build query as list of messages (system + user query)
+    query_messages = [
+        {"role": "system", "content": "You are a helpful customer service agent for Contoso."},
+        {
+            "createdAt": _timestamp(),
+            "role": "user",
+            "content": [{"type": "text", "text": trace.query}]
+        }
+    ]
+
+    # Build response as list of messages (including tool calls and final response)
+    response_messages = []
+    run_id = f"run_{_stable_id(trace.query)}"
+
+    # Add tool calls if any (None is treated as no calls)
+    for i, tool_call in enumerate(trace.tool_calls or []):
+        tool_name = tool_call.get("name", "unknown_tool")
+        tool_args = tool_call.get("args", {})
+        tool_call_id = f"call_{_stable_id(tool_name)}_{i}"
+
+        # Tool call message from assistant
+        response_messages.append({
+            "createdAt": _timestamp(),
+            "run_id": run_id,
+            "role": "assistant",
+            "content": [{
+                "type": "tool_call",
+                "tool_call_id": tool_call_id,
+                "name": tool_name,
+                "arguments": tool_args if isinstance(tool_args, dict) else {}
+            }]
+        })
+
+        # Tool result message; a placeholder success result is used when none was captured
+        response_messages.append({
+            "createdAt": _timestamp(),
+            "run_id": run_id,
+            "tool_call_id": tool_call_id,
+            "role": "tool",
+            "content": [{
+                "type": "tool_result",
+                "tool_result": tool_call.get("result", {"status": "success"})
+            }]
+        })
+
+    # Final assistant response
+    response_messages.append({
+        "createdAt": _timestamp(),
+        "run_id": run_id,
+        "role": "assistant",
+        "content": [{"type": "text", "text": trace.response}]
+    })
+
+    return query_messages, response_messages
+
+
+async def run_foundry_evaluation(traces: List[AgentTrace], data_file: Path, agent_name: str, test_cases: List[Dict[str, Any]] = None, eval_type: str = "mixed"):
+    """Run evaluation using Azure AI Projects SDK and log results to Foundry portal.
+
+    Uses the openai_client.evals API (azure-ai-projects>=2.0.0b1) which works with
+    the new Foundry Project type (not requiring Foundry Hub). Returns None; progress,
+    scores and errors are printed rather than raised.
+
+    Args:
+        traces: List of agent traces with query/response pairs
+        data_file: Path to the JSONL data file (currently unused; items are sent inline)
+        agent_name: Name of the agent for labeling
+        test_cases: Optional list of test cases with ground_truth for solution_accuracy
+        eval_type: Type of evaluation - "single-turn", "multi-turn", or "mixed"
+    """
+
+    try:
+        from azure.ai.projects import AIProjectClient
+        from azure.identity import DefaultAzureCredential
+    except ImportError as e:
+        print(f"❌ Azure AI Projects SDK not installed: {e}")
+        print("   Install with: uv add 'azure-ai-projects>=2.0.0b1' azure-identity")
+        return
+
+    # Get project endpoint from environment
+    project_endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT")
+
+    if not project_endpoint:
+        print("❌ Missing AZURE_AI_PROJECT_ENDPOINT in .env file")
+        print("   Get this from: Azure AI Foundry → Your Project → Home page")
+        print("   Example: https://eastus2.api.azureml.ms/api/projects/your-project-name")
+        return
+
+    print(f"📤 Azure AI Project Endpoint: {project_endpoint}")
+    print(f"🏷️ Agent name: {agent_name}")
+    print(f"📊 Traces to evaluate: {len(traces)}")
+
+    try:
+        # Connect to AI Project
+        credential = DefaultAzureCredential()
+        project_client = AIProjectClient(
+            endpoint=project_endpoint,
+            credential=credential,
+        )
+
+        with project_client:
+            # Get OpenAI client from the project
+            openai_client = project_client.get_openai_client()
+
+            # Check if the project has evals capability
+            if not hasattr(openai_client, 'evals'):
+                print("⚠️ This project doesn't support the evals API.")
+                print("   Make sure you have azure-ai-projects>=2.0.0b1 installed")
+                return
+
+            # Define the evaluation schema for Azure AI built-in evaluators
+            # Note: tool_calls and tool_definitions removed due to Foundry API schema issues
+            data_source_config = {
+                "type": "custom",
+                "item_schema": {
+                    "type": "object",
+                    "properties": {
+                        "query": {"type": "string"},
+                        "response": {"type": "string"},
+                        "context": {"type": "string"},
+                        "ground_truth": {"type": "string"}
+                    },
+                    "required": ["query", "response"]
+                },
+                "include_sample_schema": True
+            }
+
+            # Get the model deployment name for LLM-based evaluators
+            # First check for dedicated eval model, then fall back to chat deployment
+            model_deployment_name = os.getenv("AZURE_OPENAI_EVAL_DEPLOYMENT") or os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-4o-mini")
+            print(f"📋 Evaluation model: {model_deployment_name}")
+
+            # Check if using a reasoning model (GPT-5 or higher, o1, o3 models)
+            # Reasoning models require different configuration (e.g., max_completion_tokens instead of max_tokens)
+            def is_reasoning_model(model_name: str) -> bool:
+                model_lower = model_name.lower()
+                # Check for o-series reasoning models
+                if model_lower.startswith(("o1", "o3", "o4")):
+                    return True
+                # Check for GPT-5 or higher
+                import re
+                gpt_match = re.search(r'gpt-?(\d+)', model_lower)
+                if gpt_match:
+                    version = int(gpt_match.group(1))
+                    return version >= 5
+                return False
+
+            use_reasoning_model = is_reasoning_model(model_deployment_name)
+
+            # Build initialization parameters - include is_reasoning_model for GPT-5+ and o-series models
+            def get_init_params() -> dict:
+                params = {"deployment_name": model_deployment_name}
+                if use_reasoning_model:
+                    params["is_reasoning_model"] = True
+                return params
+
+            # Define testing criteria using Azure AI built-in evaluators
+            # These provide numeric scores (1-5 scale) with pass/fail labels and reasoning
+            # See: https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk
+            testing_criteria = [
+                # Quality evaluators (5-point scale: 1=poor, 5=excellent)
+                # Score >= 3 is considered passing by default
+                {
+                    "type": "azure_ai_evaluator",
+                    "name": "coherence",
+                    "evaluator_name": "builtin.coherence",
+                    "initialization_parameters": get_init_params(),
+                    "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}
+                },
+                {
+                    "type": "azure_ai_evaluator",
+                    "name": "fluency",
+                    "evaluator_name": "builtin.fluency",
+                    "initialization_parameters": get_init_params(),
+                    "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}
+                },
+                {
+                    "type": "azure_ai_evaluator",
+                    "name": "relevance",
+                    "evaluator_name": "builtin.relevance",
+                    "initialization_parameters": get_init_params(),
+                    "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}
+                },
+                {
+                    "type": "azure_ai_evaluator",
+                    "name": "groundedness",
+                    "evaluator_name": "builtin.groundedness",
+                    "initialization_parameters": get_init_params(),
+                    "data_mapping": {
+                        "query": "{{item.query}}",
+                        "response": "{{item.response}}",
+                        "context": "{{item.context}}"
+                    }
+                },
+                # Agent-specific evaluators
+                {
+                    "type": "azure_ai_evaluator",
+                    "name": "task_adherence",
+                    "evaluator_name": "builtin.task_adherence",
+                    "initialization_parameters": get_init_params(),
+                    "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}
+                },
+                {
+                    "type": "azure_ai_evaluator",
+                    "name": "intent_resolution",
+                    "evaluator_name": "builtin.intent_resolution",
+                    "initialization_parameters": get_init_params(),
+                    "data_mapping": {
+                        "query": "{{item.query}}",
+                        "response": "{{item.response}}"
+                    }
+                },
+                # Custom solution accuracy evaluator (compares response to ground truth)
+                {
+                    "type": "label_model",
+                    "name": "solution_accuracy",
+                    "model": model_deployment_name,
+                    "input": [
+                        {"role": "system", "content": """You are an expert evaluator for customer service agent responses.
+Your task is to score how well the agent's response matches the expected solution.
+
+Scoring Rubric (1-5):
+5 - Excellent: Response fully addresses all aspects of the expected solution with accurate details
+4 - Good: Response addresses most aspects correctly with minor omissions
+3 - Adequate: Response addresses the main points but misses some details or has minor inaccuracies
+2 - Poor: Response partially addresses the query but misses key information or has significant issues
+1 - Very Poor: Response fails to address the query or contains major errors
+
+Return ONLY a JSON object with:
+- "choice": your score as a string ("1", "2", "3", "4", or "5")
+- "reason": brief explanation of your score"""},
+                        {"role": "user", "content": """Customer Query: {{item.query}}
+
+Agent Response: {{item.response}}
+
+Expected Solution (Ground Truth): {{item.ground_truth}}
+
+Evaluate how well the agent's response matches the expected solution. Consider:
+- Did the agent provide the correct information?
+- Did the agent address all aspects of the customer's question?
+- Is the response accurate based on the ground truth?"""}
+                    ],
+                    "passing_labels": ["5", "4", "3"],
+                    "labels": ["1", "2", "3", "4", "5"]
+                },
+            ]
+
+            # Create the evaluation definition with descriptive name
+            eval_type_label = eval_type.replace("-", " ").title()  # "single-turn" -> "Single Turn"
+            print(f"\n🚀 Creating evaluation in Foundry...")
+            eval_obj = openai_client.evals.create(
+                name=f"{agent_name} - {eval_type_label}",
+                data_source_config=data_source_config,
+                testing_criteria=testing_criteria
+            )
+            print(f"✓ Evaluation created (id: {eval_obj.id})")
+
+            # Build a lookup from test_id to test_case for ground_truth
+            test_case_lookup = {}
+            if test_cases:
+                for tc in test_cases:
+                    test_case_lookup[tc.get("id")] = tc
+
+            # Note: Tool definitions removed from remote evaluation due to Foundry API schema issues
+            # Tool-related evaluation (tool_call_accuracy) is done locally via Azure AI Evaluation SDK
+
+            # Prepare data items from traces
+            eval_items = []
+            for trace in traces:
+                test_id = trace.metadata.get("test_id") if trace.metadata else None
+                test_case = test_case_lookup.get(test_id, {}) if test_id else {}
+                ground_truth = test_case.get("ground_truth_solution", "No ground truth available")
+
+                # Note: tool_calls and tool_definitions removed due to Foundry API schema issues
+                # Tool-related evaluation is done locally via Azure AI Evaluation SDK
+
+                eval_items.append({
+                    "item": {
+                        "query": trace.query,
+                        "response": trace.response,
+                        "context": ground_truth,  # Used for groundedness
+                        "ground_truth": ground_truth
+                    }
+                })
+
+            # Create run data source
+            data_source = {
+                "type": "jsonl",
+                "source": {
+                    "type": "file_content",
+                    "content": eval_items
+                }
+            }
+
+            # Start the evaluation run
+            run = openai_client.evals.runs.create(
+                eval_id=eval_obj.id,
+                name=f"{agent_name} | {eval_type_label} | {datetime.now().strftime('%Y-%m-%d %H:%M')}",
+                data_source=data_source
+            )
+            print(f"✓ Evaluation run started (id: {run.id})")
+
+            # Wait for a terminal status without blocking the event loop; either spelling of "cancel(l)ed" ends the wait
+            print("\n⏳ Waiting for evaluation to complete...")
+            while run.status not in ["completed", "failed", "cancelled", "canceled"]:
+                await asyncio.sleep(3)
+                run = openai_client.evals.runs.retrieve(
+                    eval_id=eval_obj.id,
+                    run_id=run.id
+                )
+                print(f"   Status: {run.status}")
+
+            # Display results
+            if run.status == "completed":
+                print("\n✅ Evaluation run completed successfully!")
+
+                if hasattr(run, 'result_counts') and run.result_counts:
+                    rc = run.result_counts
+                    total = rc.total if hasattr(rc, 'total') else 0
+                    passed = rc.passed if hasattr(rc, 'passed') else 0
+                    failed = rc.failed if hasattr(rc, 'failed') else 0
+
+                    print(f"\n📊 Results:")
+                    print(f"   Total:  {total}")
+                    print(f"   Passed: {passed} ✓")
+                    print(f"   Failed: {failed} ✗")
+                    if total > 0:
+                        print(f"   Pass Rate: {passed/total:.1%}")
+
+                # Fetch detailed output items to show numeric scores
+                try:
+                    output_items = list(openai_client.evals.runs.output_items.list(
+                        eval_id=eval_obj.id,
+                        run_id=run.id
+                    ))
+
+                    if output_items:
+                        print(f"\n📈 Detailed Scores by Evaluator (1-5 scale, threshold: 3):")
+                        print("-" * 70)
+
+                        # Aggregate scores by evaluator
+                        evaluator_scores: Dict[str, List[float]] = {}
+                        evaluator_details: Dict[str, List[Dict]] = {}
+
+                        for item in output_items:
+                            if hasattr(item, 'results') and item.results:
+                                for result in item.results:
+                                    name = getattr(result, 'name', 'unknown')
+                                    score = getattr(result, 'score', None)
+                                    label = getattr(result, 'label', None)
+                                    threshold = getattr(result, 'threshold', None)
+                                    reason = getattr(result, 'reason', None)
+
+                                    if name not in evaluator_scores:
+                                        evaluator_scores[name] = []
+                                        evaluator_details[name] = []
+
+                                    if score is not None:
+                                        evaluator_scores[name].append(score)
+                                        evaluator_details[name].append({
+                                            'score': score,
+                                            'label': label,
+                                            'threshold': threshold,
+                                            'reason': reason[:100] + '...' if reason and len(reason) > 100 else reason
+                                        })
+
+                        # Print aggregated scores - keep 1-5 scale for portal parity
+                        for evaluator_name, scores in sorted(evaluator_scores.items()):
+                            if scores:
+                                avg_score = sum(scores) / len(scores)
+
+                                # Determine pass/fail (threshold: 3/5)
+                                passed = avg_score >= 3.0
+                                status = "✓" if passed else "✗"
+
+                                # Create visual bar (scaled for 1-5 range)
+                                bar_length = int(avg_score * 4)  # Max 20 chars at score 5
+                                bar = "█" * bar_length
+
+                                print(f"   {evaluator_name:25} {avg_score:4.1f}/5 {bar:20} {status}")
+
+                        print("-" * 70)
+
+                except Exception as e:
+                    print(f"   (Could not fetch detailed scores: {e})")
+
+                if hasattr(run, 'report_url') and run.report_url:
+                    print(f"\n🔗 View in Foundry portal:")
+                    print(f"   {run.report_url}")
+                else:
+                    print(f"\n🔗 View results in Azure AI Foundry portal:")
+                    print(f"   https://ai.azure.com")
+
+            else:
+                print(f"\n❌ Evaluation run failed: {run.status}")
+                if getattr(run, 'error', None):  # only print when an error is actually reported
+                    print(f"   Error: {run.error}")
+
+    except Exception as e:
+        print(f"❌ Error running Foundry evaluation: {e}")
+        import traceback
+        traceback.print_exc()
+
+        print("\n💡 Troubleshooting tips:")
+        print("   1. Verify AZURE_AI_PROJECT_ENDPOINT is correct")
+        print("   2. Make sure you're signed in: az login")
+        print("   3. Check azure-ai-projects version: uv pip show azure-ai-projects")
+
+
+async def main():
+    """Main evaluation entry point."""
+    
+    # Parse command line arguments
+    import argparse
+    parser = argparse.ArgumentParser(description="Run agent evaluations")
+    parser.add_argument("--agent", default=None, 
+                        help="Agent type: single, reflection, handoff (overrides --agent-name)")
+    parser.add_argument("--agent-name", default="agent_eval", help="Name for telemetry tracking")
+    parser.add_argument("--backend-url", default="http://localhost:700", help="Backend URL to send requests to")
+    parser.add_argument("--remote", action="store_true", help="Run evaluation in Azure AI Foundry portal only (skip local)")
+    parser.add_argument("--local", action="store_true", help="Run local evaluation only (default if neither specified)")
+    parser.add_argument("--limit", type=int, default=0, help="Limit number of test cases to run (0 = all)")
+    parser.add_argument("--multi-turn-only", action="store_true", help="Only run multi-turn test cases")
+    parser.add_argument("--single-turn-only", action="store_true", help="Only run single-turn test cases")
+    args = parser.parse_args()
+    
+    # Determine agent name based on --agent flag
+    if args.agent:
+        agent_name = f"agent_{args.agent}"
+    else:
+        agent_name = args.agent_name
+    
+    backend_url = args.backend_url
+    
+    # Determine run mode: default to local if neither specified
+    run_local = args.local or not args.remote
+    run_remote = args.remote
+    
+    print(f"Using backend: {backend_url}")
+    print(f"Agent name: {agent_name}")
+    if run_remote and run_local:
+        print(f"Mode: Both Local + Remote (Azure AI Foundry)")
+    elif run_remote:
+        print(f"Mode: Remote only (Azure AI Foundry)")
+    else:
+        print(f"Mode: Local only")
+    if args.multi_turn_only:
+        print(f"Filter: Multi-turn only")
+    elif args.single_turn_only:
+        print(f"Filter: Single-turn only")
+    
+    # 1. No need to load agent module - we're sending HTTP requests
+    print(f"\n🌐 Using HTTP requests to backend instead of direct agent creation")
+    
+    # 2. Test backend connection
+    try:
+        import httpx
+        async with httpx.AsyncClient() as client:
+            health_response = await client.get(f"{backend_url}/health", timeout=5.0)
+            print(f"✓ Backend is responding")
+    except Exception as e:
+        print(f"❌ Cannot connect to backend: {e}")
+        print(f"   Make sure backend is running on {backend_url}")
+        return
+    
+    # 3. Check MCP server
+    mcp_uri = os.getenv("MCP_SERVER_URI", "http://localhost:8000/mcp")
+    print(f"\n🔌 MCP Server: {mcp_uri}")
+    
+    try:
+        import requests
+        health_check = requests.get(mcp_uri.replace("/mcp", "/health"), timeout=2)
+        print(f"✓ MCP server is responding")
+    except:
+        print(f"⚠ WARNING: Could not connect to MCP server")
+        print(f"   Make sure it's running: cd mcp && uv run python mcp_service.py")
+        response = input("\nContinue anyway? (y/n): ")
+        if response.lower() != 'y':
+            return
+    
+    # 4. Load test cases
+    dataset_path = Path(__file__).parent / "eval_dataset.json"
+    with open(dataset_path, encoding='utf-8') as f:
+        data = json.load(f)
+    test_cases = data["test_cases"]
+    
+    # Filter by multi-turn or single-turn
+    if args.multi_turn_only:
+        test_cases = [tc for tc in test_cases if tc.get("multi_turn", False)]
+        print(f"\n🔄 Filtering to multi-turn test cases only")
+    elif args.single_turn_only:
+        test_cases = [tc for tc in test_cases if not tc.get("multi_turn", False)]
+        print(f"\n📝 Filtering to single-turn test cases only")
+    
+    # Apply limit if specified
+    if args.limit > 0:
+        test_cases = test_cases[:args.limit]
+        print(f"\n⚡ Limited to {args.limit} test case(s) for quick testing")
+    
+    # Count multi-turn scenarios
+    multi_turn_count = sum(1 for tc in test_cases if tc.get("multi_turn", False))
+    single_turn_count = len(test_cases) - multi_turn_count
+    
+    print(f"\n📋 Running {len(test_cases)} test cases")
+    print(f"   - Single-turn: {single_turn_count}")
+    print(f"   - Multi-turn: {multi_turn_count}")
+    
+    # 5. Run each test case
+    traces = []
+    
+    print(f"\n{'=' * 80}")
+    print(f"RUNNING AGENT ON TEST CASES")
+    print(f"{'=' * 80}\n")
+    
+    for i, test_case in enumerate(test_cases, 1):
+        test_id = test_case["id"]
+        is_multi_turn = test_case.get("multi_turn", False)
+        customer_id = test_case.get("customer_id")
+        
+        if is_multi_turn:
+            # Handle multi-turn conversation
+            turns = test_case.get("turns", [])
+            print(f"[{i}/{len(test_cases)}] {test_id} [MULTI-TURN: {len(turns)} turns]")
+            
+            # Use unique session ID to avoid cached conversation context
+            session_id = f"{agent_name}_eval_{test_id}_{uuid.uuid4().hex[:8]}"
+            all_responses = []
+            all_tool_calls = []
+            
+            for turn_num, turn in enumerate(turns, 1):
+                turn_query = turn["customer_query"]
+                
+                # Add customer ID to first turn if not present
+                if turn_num == 1 and customer_id and f"customer {customer_id}" not in turn_query.lower():
+                    turn_query = f"I'm customer {customer_id}. {turn_query}"
+                
+                print(f"  Turn {turn_num}: {turn_query[:60]}...")
+                
+                try:
+                    import httpx
+                    
+                    async with httpx.AsyncClient() as client:
+                        response_obj = await client.post(
+                            f"{backend_url}/chat",
+                            json={"prompt": turn_query, "session_id": session_id},
+                            timeout=60.0
+                        )
+                        response_obj.raise_for_status()
+                        
+                        result = response_obj.json()
+                        response = result.get("response", "")
+                        tools_used = result.get("tools_used", [])
+                        
+                        all_responses.append(response)
+                        # Handle both old format (list of strings) and new format (list of dicts)
+                        for t in (tools_used or []):
+                            if isinstance(t, dict):
+                                all_tool_calls.append(t)
+                            else:
+                                all_tool_calls.append({"name": t, "args": {}})
+                        
+                        print(f"    → Response: {response[:60]}... | Tools: {len(tools_used or [])}")
+                        
+                except Exception as e:
+                    print(f"    ❌ Error in turn {turn_num}: {e}")
+                    all_responses.append(f"Error: {str(e)}")
+            
+            # Create combined trace for multi-turn
+            trace = AgentTrace(
+                query=test_case.get("customer_query", turns[0]["customer_query"] if turns else ""),
+                response="\n\n---\n\n".join(all_responses),
+                tool_calls=all_tool_calls,
+                metadata={
+                    "test_id": test_id,
+                    "agent_backend": backend_url,
+                    "session_id": session_id,
+                    "is_multi_turn": True,
+                    "turn_count": len(turns),
+                    "turn_responses": all_responses,
+                }
+            )
+            traces.append(trace)
+            
+        else:
+            # Handle single-turn conversation (original logic)
+            query = test_case["customer_query"]
+            
+            # Augment query with customer ID if available
+            if customer_id and f"customer {customer_id}" not in query.lower():
+                query = f"I'm customer {customer_id}. {query}"
+            
+            print(f"[{i}/{len(test_cases)}] {test_id}")
+            print(f"Query: {query[:80]}...")
+            
+            # Use unique session ID to avoid cached conversation context
+            session_id = f"{agent_name}_eval_{test_id}_{uuid.uuid4().hex[:8]}"
+            
+            try:
+                import httpx
+                
+                request_data = {
+                    "prompt": query,
+                    "session_id": session_id
+                }
+                
+                async with httpx.AsyncClient() as client:
+                    response_obj = await client.post(
+                        f"{backend_url}/chat",
+                        json=request_data,
+                        timeout=60.0
+                    )
+                    response_obj.raise_for_status()
+                    
+                    result = response_obj.json()
+                    response = result.get("response", "")
+                    tools_used = result.get("tools_used", [])
+                    
+                    # Handle both old format (list of strings) and new format (list of dicts)
+                    tool_calls = []
+                    for t in (tools_used or []):
+                        if isinstance(t, dict):
+                            tool_calls.append(t)
+                        else:
+                            tool_calls.append({"name": t, "args": {}})
+                
+                print(f"  ✓ Response: {response[:100]}...")
+                print(f"  ✓ Tools called: {len(tool_calls)}")
+                
+                trace = AgentTrace(
+                    query=test_case["customer_query"],
+                    response=response,
+                    tool_calls=tool_calls,
+                    metadata={
+                        "test_id": test_id,
+                        "agent_backend": backend_url,
+                        "session_id": session_id,
+                        "augmented_query": query,
+                        "is_multi_turn": False,
+                    }
+                )
+                traces.append(trace)
+                
+            except Exception as e:
+                print(f"  ❌ Error: {e}")
+                trace = AgentTrace(
+                    query=query,
+                    response=f"Error: {str(e)}",
+                    tool_calls=[],
+                    metadata={
+                        "test_id": test_id,
+                        "agent_backend": backend_url,
+                        "error": str(e),
+                        "is_multi_turn": False,
+                    }
+                )
+                traces.append(trace)
+        
+        print()
+    
+    # 6. Generate evaluation_input_data.jsonl for Foundry integration
+    print(f"{'=' * 80}")
+    print(f"GENERATING FOUNDRY DATA FILE")
+    print(f"{'=' * 80}\n")
+    
+    foundry_data_file = Path(__file__).parent / "evaluation_input_data.jsonl"
+    with open(foundry_data_file, 'w') as f:
+        for trace in traces:
+            # Extract test case data from metadata
+            test_id = trace.metadata.get("test_id", "unknown")
+            
+            # Find matching test case from original dataset
+            matching_test = None
+            for test_case in test_cases:
+                if test_case.get("id") == test_id:
+                    matching_test = test_case
+                    break
+            
+            # Prepare data in format expected by run_eval.py
+            foundry_row = {
+                "query": trace.query,
+                "response": trace.response,
+                "expected_tools": matching_test.get("expected_tools", []) if matching_test else [],
+                "required_tools": matching_test.get("required_tools", []) if matching_test else [],
+                "success_criteria": matching_test.get("success_criteria", {}) if matching_test else {},
+                "tool_calls": [{"name": tc["name"], "args": tc.get("args", {})} for tc in trace.tool_calls]
+            }
+            
+            f.write(json.dumps(foundry_row) + '\n')
+    
+    print(f"✓ Generated {foundry_data_file} with {len(traces)} evaluation rows")
+    
+    # 7. Run local evaluation (if --local or neither flag specified)
+    if run_local:
+        print(f"{'=' * 80}")
+        print(f"EVALUATING RESULTS (LOCAL)")
+        print(f"{'=' * 80}\n")
+        
+        runner = AgentEvaluationRunner(dataset_path=str(dataset_path))
+        summary = runner.run_evaluation(
+            traces,
+            output_dir=str(Path(__file__).parent / "eval_results")
+        )
+        
+        # Display summary
+        print(f"\n{'=' * 80}")
+        print(f"EVALUATION SUMMARY - {backend_url}")
+        print(f"{'=' * 80}")
+        print(f"Agent: {agent_name}")
+        print(f"Total Tests:    {summary['total_tests']}")
+        print(f"Passed:         {summary['passed']} ✓")
+        print(f"Failed:         {summary['failed']} ✗")
+        print(f"Pass Rate:      {summary['pass_rate']:.1%}")
+        print(f"Average Score:  {summary['average_score']:.2f}")
+        
+        # Show different metric emphasis for multi-turn vs single-turn
+        if args.multi_turn_only:
+            print(f"\n📊 Multi-Turn Metrics (outcome-focused, 1-5 scale, threshold: 3):")
+            outcome_metrics = ["solution_accuracy", "task_adherence", "intent_resolution", "coherence", "fluency", "relevance"]
+            for metric in outcome_metrics:
+                score = summary['metric_averages'].get(metric, 0)
+                bar = "█" * int(score * 4)
+                status = "✓" if score >= 3.0 else "✗"
+                print(f"  {metric:30s}: {score:4.1f}/5 {bar:20} {status}")
+        else:
+            print(f"\nMetric Breakdown (1-5 scale, threshold: 3):")
+            for metric, score in summary['metric_averages'].items():
+                bar = "█" * int(score * 4)  # Scale bar for 1-5 range (max 20 chars at score 5)
+                status = "✓" if score >= 3.0 else "✗"
+                print(f"  {metric:30s}: {score:4.1f}/5 {bar:20} {status}")
+        
+        print(f"\n{'=' * 80}")
+        print(f"✓ Local evaluation complete! Check eval_results/ for detailed reports.")
+        print(f"{'=' * 80}\n")
+    
+    # 8. Push to Azure AI Foundry if --remote flag is set
+    if run_remote:
+        print(f"{'=' * 80}")
+        print(f"PUSHING RESULTS TO AZURE AI FOUNDRY")
+        print(f"{'=' * 80}\n")
+        
+        # Check for required environment variable
+        project_endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT")
+        
+        if not project_endpoint:
+            print("❌ Missing AZURE_AI_PROJECT_ENDPOINT in .env file")
+            print("   Get this from: Azure AI Foundry → Your Project → Settings → Project details")
+            print("   Example: https://your-account.services.ai.azure.com/api/projects/your-project")
+            print("\n   Skipping remote evaluation...")
+        else:
+            # Determine eval type for naming
+            if args.multi_turn_only:
+                eval_type = "multi-turn"
+            elif args.single_turn_only:
+                eval_type = "single-turn"
+            else:
+                eval_type = "mixed"
+            
+            # Use the new Azure AI Projects SDK approach (azure-ai-projects>=2.0.0b1)
+            # This uses openai_client.evals API instead of azure.ai.evaluation.evaluate()
+            await run_foundry_evaluation(traces, foundry_data_file, agent_name, test_cases, eval_type)
+    
+    # Give async tasks time to cleanup
+    await asyncio.sleep(0.1)
+
+
+if __name__ == "__main__":
+    # Script entry point: drive the async evaluation pipeline to completion.
+    # asyncio.run() creates the event loop and closes it on exit, so no
+    # additional cleanup step is required here.
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        print("\n\nEvaluation cancelled by user.")
diff --git a/agentic_ai/evaluations/telemetry.py b/agentic_ai/evaluations/telemetry.py
new file mode 100644
index 000000000..166fe5536
--- /dev/null
+++ b/agentic_ai/evaluations/telemetry.py
@@ -0,0 +1,61 @@
+import os
+from typing import Optional
+
+try:  # Optional dependency; backend should still run without it
+    from azure.monitor.opentelemetry import configure_azure_monitor
+    from agent_framework.observability import setup_observability
+    from opentelemetry.sdk.resources import Resource
+except Exception:  # pragma: no cover - best-effort import
+    configure_azure_monitor = None  # type: ignore[assignment]
+    setup_observability = None  # type: ignore[assignment]
+    Resource = None  # type: ignore[assignment]
+
+
+def setup_telemetry() -> None:
+    """Configure Azure Monitor for Agent Framework to show traces in Foundry.
+
+    This follows the exact pattern from the Microsoft blog post:
+    "Agentic Applications on Azure Container Apps with Microsoft Foundry"
+
+    Best-effort: setup is skipped when APPLICATION_INSIGHTS_CONNECTION_STRING
+    is unset/empty or configure_azure_monitor could not be imported, and any
+    configuration error is printed with a traceback instead of being raised.
+    """
+
+    print("🔍 DEBUG: setup_telemetry() called")
+
+    connection_string: Optional[str] = os.getenv("APPLICATION_INSIGHTS_CONNECTION_STRING")
+    print(f"🔍 DEBUG: Application Insights connection string exists: {bool(connection_string)}")
+
+    if not connection_string:
+        print("❌ DEBUG: No APPLICATION_INSIGHTS_CONNECTION_STRING found, skipping telemetry")
+        return
+
+    if configure_azure_monitor is None:
+        print("❌ DEBUG: configure_azure_monitor not available, skipping telemetry")
+        return
+
+    try:
+        print("🚀 DEBUG: Calling configure_azure_monitor...")
+        # Configure Azure Monitor first (exact pattern from blog)
+        configure_azure_monitor(
+            resource=Resource.create({"service.name": "contoso-agent-backend"}) if Resource else None,
+            connection_string=connection_string,
+            disable_offline_storage=True,  # Disable storage to avoid the NoneType error
+        )
+        print("✅ DEBUG: configure_azure_monitor completed")
+
+        # Enable Microsoft Agent Framework telemetry (correct function!)
+        if setup_observability is not None:
+            print("🚀 DEBUG: Calling setup_observability...")
+            setup_observability(enable_sensitive_data=False)
+            print("✅ DEBUG: setup_observability completed")
+        else:
+            print("❌ DEBUG: setup_observability not available")
+
+        print("🎉 DEBUG: Telemetry setup complete!")
+    except Exception as e:
+        # Telemetry should never break the app; report the error and carry on.
+        print(f"❌ DEBUG: Telemetry setup failed with error: {e}")
+        import traceback
+        traceback.print_exc()
\ No newline at end of file
diff --git a/infra/ppt.md b/infra/ppt.md
new file mode 100644
index 000000000..9163de1f1
--- /dev/null
+++ b/infra/ppt.md
@@ -0,0 +1,107 @@
+# Enterprise-Ready Agentic AI Architecture
+
+**From prototype to production: a secure, end-to-end blueprint for agentic AI on Azure**
+
+---
+
+## What We Added
+
+| Feature | Description |
+|---------|-------------|
+| ✅ End-to-end agentic AI reference architecture | Complete stack from MCP tools → Agent orchestration → Backend → Frontend |
+| ✅ Enterprise security by default | VNet integration, private endpoints, zero-trust managed identity |
+| ✅ No secrets, no public exposure | Internal MCP, RBAC everywhere, HTTPS ingress only |
+| ✅ Production-ready automation | Terraform/Bicep IaC + GitHub Actions CI/CD with OIDC |
+
+## Why It Matters
+
+| Gap | Solution |
+|-----|----------|
+| ❗ Industry lacks clear guidance for enterprise-grade agentic AI | ✅ Repeatable, opinionated blueprint from Dev → Prod |
+
+---
+
+## Architecture Diagram
+
+```mermaid
+flowchart TB
+
+    %% User / Client Layer
+    User["👤 Users & Apps<br/>Web / Enterprise Clients"]
+    User -->|"🔒 HTTPS"| FE["🌐 Public Entry<br/>Azure Container Apps<br/>Managed TLS"]
+
+    %% Enterprise VNet Boundary
+    subgraph VNET["🛡️ Enterprise VNet - Network Isolated"]
+        direction TB
+
+        %% Agentic AI Layer
+        subgraph AGENTS["🤖 Agentic AI Layer"]
+            BE["⚙️ Agent Orchestrator<br/>Backend Agent<br/>Managed Identity"]
+            MCP["🔧 MCP Service<br/>Internal Only<br/>No Public Ingress"]
+            BE -->|"Internal HTTP"| MCP
+        end
+
+        %% Platform & Data Layer
+        subgraph PLATFORM["☁️ Platform & Data Layer"]
+            AOAI["🧠 Azure OpenAI<br/>Private Endpoint<br/>RBAC Access"]
+            COSMOS["💾 Cosmos DB<br/>Private Endpoint<br/>RBAC Data Plane"]
+            ACR["📦 Container Registry<br/>AcrPull via Identity"]
+        end
+
+        %% Security & Ops
+        subgraph SECURITY["🔐 Security & Operations"]
+            MI["🎫 Managed Identity<br/>No API Keys"]
+            RBAC["👥 Azure RBAC<br/>Least Privilege"]
+            CICD["🚀 GitHub Actions<br/>OIDC Auth"]
+        end
+
+        %% Connections
+        FE --> BE
+        BE --> AOAI
+        MCP --> COSMOS
+
+        BE -.->|"auth"| MI
+        MCP -.->|"auth"| MI
+        MI -.-> AOAI
+        MI -.-> COSMOS
+
+        ACR -.->|"pull"| BE
+        ACR -.->|"pull"| MCP
+    end
+
+    %% Environments
+    subgraph ENV["📊 Security Profiles"]
+        DEV["🟢 Dev<br/>Minimal Security"]
+        STAGE["🟡 Staging<br/>VNet + Internal MCP"]
+        PROD["🔴 Prod<br/>Full Zero Trust"]
+    end
+
+    CICD --> DEV
+    CICD --> STAGE
+    CICD --> PROD
+
+    %% Guidance Gap
+    GAP["⚠️ Industry Gap<br/>Most samples stop at PoC<br/>No VNet • API Keys<br/>Public AI & DB"]
+    User -.->|"❌ Don't do this"| GAP
+
+    %% Styling - Vibrant colors
+    classDef user fill:#1976D2,stroke:#0D47A1,stroke-width:3px,color:#fff,font-weight:bold
+    classDef entry fill:#43A047,stroke:#1B5E20,stroke-width:3px,color:#fff,font-weight:bold
+    classDef agents fill:#FF9800,stroke:#E65100,stroke-width:2px,color:#000,font-weight:bold
+    classDef platform fill:#9C27B0,stroke:#4A148C,stroke-width:2px,color:#fff,font-weight:bold
+    classDef security fill:#00ACC1,stroke:#006064,stroke-width:2px,color:#fff,font-weight:bold
+    classDef envDev fill:#4CAF50,stroke:#2E7D32,stroke-width:2px,color:#fff
+    classDef envStage fill:#FFC107,stroke:#FF8F00,stroke-width:2px,color:#000
+    classDef envProd fill:#F44336,stroke:#B71C1C,stroke-width:2px,color:#fff
+    classDef gap fill:#FFCDD2,stroke:#D32F2F,stroke-width:3px,stroke-dasharray:5 5,color:#B71C1C,font-weight:bold
+
+    class User user
+    class FE entry
+    class BE,MCP agents
+    class AOAI,COSMOS,ACR platform
+    class MI,RBAC,CICD security
+    class DEV envDev
+    class STAGE envStage
+    class PROD envProd
+    class GAP gap
+```
\ No newline at end of file
diff --git a/infra/workshop_summary.md b/infra/workshop_summary.md
new file mode 100644
index 000000000..e503fe30a
--- /dev/null
+++ b/infra/workshop_summary.md
@@ -0,0 +1,113 @@
+# Enterprise-Ready Agentic AI Workshop
+
+**Build and deploy secure, end-to-end agentic AI solutions on Azure**
+
+---
+
+## Who Is This For
+
+Infrastructure engineers and enterprise architects with in-depth Azure knowledge who need to deploy agentic AI in an enterprise-grade manner.
+
+---
+
+## What You'll Learn
+
+- ✅ **End-to-end agentic architecture** — Database → MCP tools → Agent orchestration → Backend → Frontend
+- ✅ **Your choice of IaC** — Bicep or Terraform, manual scripts or GitHub Actions
+- ✅ **Modern identity principles** — OIDC for GitHub Actions, Managed Identity for Azure services (no keys)
+- ✅ **Network isolation** — VNet with private endpoints, only frontend exposed to internet
+- ✅ **Automated CI/CD pipelines** — GitHub Actions, parallel container builds, integration testing, multi-environment deployment
+- ✅ **Enterprise-ready template** — Scalable, reusable blueprint for standalone or landing zone deployment
+
+---
+
+## Why It Matters
+
+Most agentic AI samples stop at proof-of-concept — public endpoints, API keys, no network isolation. This workshop provides a **repeatable, production-ready blueprint** from Dev → Prod.
+
+---
+
+## Architecture Diagram
+
+```mermaid
+flowchart TB
+
+    %% User / Client Layer
+    User["👤 Users & Apps<br/>Web / Enterprise Clients"]
+    User -->|"🔒 HTTPS"| FE["🌐 Public Entry<br/>Azure Container Apps<br/>Managed TLS"]
+
+    %% Enterprise VNet Boundary
+    subgraph VNET["🛡️ Enterprise VNet - Network Isolated"]
+        direction TB
+
+        %% Agentic AI Layer
+        subgraph AGENTS["🤖 Agentic AI Layer"]
+            BE["⚙️ Agent Orchestrator<br/>Backend Agent<br/>Managed Identity"]
+            MCP["🔧 MCP Service<br/>Internal Only<br/>No Public Ingress"]
+            BE -->|"Internal HTTP"| MCP
+        end
+
+        %% Platform & Data Layer
+        subgraph PLATFORM["☁️ Platform & Data Layer"]
+            AOAI["🧠 Azure OpenAI<br/>Private Endpoint<br/>RBAC Access"]
+            COSMOS["💾 Cosmos DB<br/>Private Endpoint<br/>RBAC Data Plane"]
+            ACR["📦 Container Registry<br/>AcrPull via Identity"]
+        end
+
+        %% Security & Ops
+        subgraph SECURITY["🔐 Security & Operations"]
+            MI["🎫 Managed Identity<br/>No API Keys"]
+            RBAC["👥 Azure RBAC<br/>Least Privilege"]
+            CICD["🚀 GitHub Actions<br/>OIDC Auth"]
+        end
+
+        %% Connections
+        FE --> BE
+        BE --> AOAI
+        MCP --> COSMOS
+
+        BE -.->|"auth"| MI
+        MCP -.->|"auth"| MI
+        MI -.-> AOAI
+        MI -.-> COSMOS
+
+        ACR -.->|"pull"| BE
+        ACR -.->|"pull"| MCP
+    end
+
+    %% Environments
+    subgraph ENV["📊 Security Profiles"]
+        DEV["🟢 Dev<br/>Minimal Security"]
+        STAGE["🟡 Staging<br/>VNet + Internal MCP"]
+        PROD["🔴 Prod<br/>Full Zero Trust"]
+    end
+
+    CICD --> DEV
+    CICD --> STAGE
+    CICD --> PROD
+
+    %% Guidance Gap
+    GAP["⚠️ Industry Gap<br/>Most samples stop at PoC<br/>No VNet • API Keys<br/>Public AI & DB"]
+    User -.->|"❌ Don't do this"| GAP
+
+    %% Styling - Vibrant colors
+    classDef user fill:#1976D2,stroke:#0D47A1,stroke-width:3px,color:#fff,font-weight:bold
+    classDef entry fill:#43A047,stroke:#1B5E20,stroke-width:3px,color:#fff,font-weight:bold
+    classDef agents fill:#FF9800,stroke:#E65100,stroke-width:2px,color:#000,font-weight:bold
+    classDef platform fill:#9C27B0,stroke:#4A148C,stroke-width:2px,color:#fff,font-weight:bold
+    classDef security fill:#00ACC1,stroke:#006064,stroke-width:2px,color:#fff,font-weight:bold
+    classDef envDev fill:#4CAF50,stroke:#2E7D32,stroke-width:2px,color:#fff
+    classDef envStage fill:#FFC107,stroke:#FF8F00,stroke-width:2px,color:#000
+    classDef envProd fill:#F44336,stroke:#B71C1C,stroke-width:2px,color:#fff
+    classDef gap fill:#FFCDD2,stroke:#D32F2F,stroke-width:3px,stroke-dasharray:5 5,color:#B71C1C,font-weight:bold
+
+    class User user
+    class FE entry
+    class BE,MCP agents
+    class AOAI,COSMOS,ACR platform
+    class MI,RBAC,CICD security
+    class DEV envDev
+    class STAGE envStage
+    class PROD envProd
+    class GAP gap
+```
\ No newline at end of file
diff --git a/mcp/data/contoso.db b/mcp/data/contoso.db
index d52222bb1..e5397adf1 100644
Binary files a/mcp/data/contoso.db and b/mcp/data/contoso.db differ
diff --git a/tests/pytest.ini b/tests/pytest.ini
index 3c21701d9..3d63c2667 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -2,9 +2,14 @@
 markers =
     integration: marks tests as integration tests (require deployed services)
     unit: marks tests as unit tests (run locally without external services)
+    evaluation: marks tests as agent evaluation tests
+    slow: marks tests as slow-running (may incur API costs)
 
 # Default options
 addopts = -v --tb=short
 
 # Timeout for individual tests (in seconds)
 timeout = 300
+
+# Async mode for pytest-asyncio
+asyncio_mode = auto
diff --git a/tests/requirements.txt b/tests/requirements.txt
index aa6aa5bc9..09012f1a3 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -5,4 +5,8 @@ pytest-timeout
 requests
 azure-identity
 azure-keyvault-secrets
-fastmcp
\ No newline at end of file
+fastmcp
+
+# Agent Evaluation dependencies
+azure-ai-evaluation
+python-dotenv
\ No newline at end of file