eval-protocol
diff --git a/eval_protocol/mcp/client/connection.py b/eval_protocol/mcp/client/connection.py
Lines changed: 6 additions & 11 deletions
diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py
Lines changed: 1 addition & 1 deletion
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
Lines changed: 30 additions & 14 deletions
@@ -9,14 +9,16 @@
 import hashlib
 import json
 import logging
+import time
 from contextlib import AsyncExitStack
 from typing import Any, Dict, List, Optional, Tuple
 
+import httpx
 from mcp.client.session import ClientSession
 from mcp.client.streamable_http import streamablehttp_client
+from mcp.types import Implementation
 
 from ...types import MCPSession
-from mcp.types import Implementation
 
 logger = logging.getLogger(__name__)
 
@@ -109,15 +111,13 @@ async def reset_session(self, session: MCPSession) -> None:
         """
         Clean session data in remote mcp server for the given session
         """
-        import httpx
-
         base_url = session.base_url.rstrip("/").removesuffix("/mcp")
         url = f"{base_url}/control/reset_session"
 
         headers = {"mcp-session-id": session.session_id}
         body = {"seed": session.seed}
 
-        timeout = httpx.Timeout(3.0)
+        timeout = httpx.Timeout(15.0)
         async with httpx.AsyncClient(timeout=timeout) as client:
             resp = await client.post(url, headers=headers, json=body)
             resp.raise_for_status()
@@ -202,8 +202,6 @@ async def get_initial_state(self, session: MCPSession) -> Any:
         initial_observation = None
 
         try:
-            import httpx
-
             # Extract base URL and session ID from the MCP session
             base_url = session.base_url.rstrip("/").removesuffix("/mcp")
             session_id = session.session_id
@@ -459,9 +457,6 @@ async def call_tool(self, session: MCPSession, tool_name: str, arguments: Dict)
         control_plane_info = {}
 
         try:
-            # Query control plane endpoints following the new architecture
-            import httpx
-
             # Extract base URL and session ID from the MCP session
             base_url = session.base_url.rstrip("/").removesuffix("/mcp")
             # Use the session ID from the established MCP session
@@ -544,10 +539,10 @@ async def close_session(self, session: MCPSession) -> None:
                 await session._exit_stack.aclose()
             except asyncio.CancelledError:
                 # Handle cancellation gracefully (especially important for Python 3.12)
-                logger.debug(f"Session {session.session_id} close was cancelled")
+                logger.error(f"Session {session.session_id} close was cancelled")
             except Exception as e:
                 # Hitting this error, probably because of use of threads: "Attempted to exit cancel scope in a different task than it was entered in"
-                logger.debug(f"Error closing session {session.session_id}: {e}")
+                logger.error(f"Error closing session {session.session_id}: {e}")
             finally:
                 session._exit_stack = None
                 session._mcp_session = None
@@ -220,7 +220,7 @@ async def _generate_live_tool_calls(
             return mcp_tool_calls, usage_stats
         else:
             # No tool calls in response - this is normal when episode ends or LLM provides only text
-            logger.info(f"No tool calls in response for env {env_index}, message content: {message.get('content')}")
+            logger.debug(f"No tool calls in response for env {env_index}, message content: {message.get('content')}")
             return [
                 MCPToolCall(
                     tool_name="_no_tool_call",
 
@@ -11,8 +11,7 @@
 import os
 import threading
 import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import asdict, dataclass
+from dataclasses import asdict
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 from openai.types import CompletionUsage
@@ -98,10 +97,12 @@ async def execute_rollouts(
 
         async def _execute_with_semaphore(idx):
             async with semaphore:
-                return await self._execute_rollout(
+                result = await self._execute_rollout(
                     envs, policy, idx, steps, openai_logger, recording_mode, playback_mode, start_time
                 )
 
+                return result
+
         tasks = [_execute_with_semaphore(i) for i in range(envs.n)]
         # exceptions will be try catched inside single _execute_rollout
         trajectories = await asyncio.gather(*tasks)
@@ -113,9 +114,6 @@ async def _execute_with_semaphore(idx):
 
         shared_tool_schema = envs.tool_schemas
 
-        # Clean up
-        await envs.close()
-
         # Enhanced reporting with control plane info
         successful = sum(1 for traj in trajectories if traj.total_reward > 0)
         terminated_by_control_plane = sum(
@@ -176,8 +174,11 @@ async def _execute_with_semaphore(idx):
                     TerminationReason.USER_STOP,
                 }:
                     evaluation_rows[idx].rollout_status.status = "finished"
-                elif trajectory.termination_reason == TerminationReason.MAX_STEPS:
+                elif trajectory.termination_reason in {TerminationReason.MAX_STEPS, TerminationReason.INTERRUPTED}:
                     evaluation_rows[idx].rollout_status.status = "stopped"
+                    evaluation_rows[idx].rollout_status.error_message = trajectory.control_plane_summary.get(
+                        "termination_reason", trajectory.termination_reason
+                    )
                 else:
                     evaluation_rows[idx].rollout_status.status = "error"
                     evaluation_rows[idx].rollout_status.error_message = trajectory.control_plane_summary.get(
@@ -227,6 +228,7 @@ async def _execute_rollout(
                 "total_tokens": 0,
             },
         )
+        failure_reason = None
         try:
             current_observation, tool_schema = await envs.reset(session)
             system_prompt = dataset_row.system_prompt
@@ -248,7 +250,7 @@ async def _execute_rollout(
 
                 # Get initial messages in tau2-bench format for user simulator
                 user_simulator_state = user_simulator.get_init_state()
-                user_message, user_simulator_state = user_simulator.generate_next_message(
+                user_message, user_simulator_state = await user_simulator.generate_next_message(
                     AssistantMessage(role="assistant", content="Hi! How can I help you today?"),
                     user_simulator_state,
                 )
@@ -280,7 +282,7 @@ async def _execute_rollout(
                     # Last message was agent, simulated user response
                     if user_simulator_messages and isinstance(user_simulator_messages[-1], AssistantMessage):
                         # Generate user response using the simulator
-                        user_message, user_simulator_state = user_simulator.generate_next_message(
+                        user_message, user_simulator_state = await user_simulator.generate_next_message(
                             user_simulator_messages[-1], user_simulator_state
                         )
                         user_content = user_message.content if user_message.content else ""
@@ -312,8 +314,7 @@ async def _execute_rollout(
                         # If there's no user simulator, no tool call means policy failed and we should terminate the rollout
                         elif tool_calls[0].tool_name in ["_playback_terminate", "_no_tool_call"]:
                             trajectory.terminated = True
-                            trajectory.termination_reason = TerminationReason.ERROR
-                            trajectory.control_plane_summary.update({"error_message": "No expected tool call"})
+                            trajectory.termination_reason = TerminationReason.INTERRUPTED
                             break
 
                     # Execute each tool call sequentially
@@ -467,11 +468,26 @@ async def _execute_rollout(
             logger.info(
                 f"✅ Rollout {rollout_idx} completed: {trajectory.steps} steps, reward: {trajectory.total_reward:.2f}, termination: {trajectory.termination_reason}, in thread {threading.current_thread().name}"
             )
+
+        except asyncio.CancelledError:
+            logger.error(f"🚨 AsyncIO Cancel Error in roll out {rollout_idx}", exc_info=True)
+            failure_reason = "asyncio context cancelled"
         except Exception as e:
             logger.error(f"🚨 Error in rollout {rollout_idx}: {e}", exc_info=True)
-            trajectory.terminated = True
-            trajectory.termination_reason = TerminationReason.ERROR
-            trajectory.control_plane_summary.update({"error_message": str(e)})
+            failure_reason = str(e)
+        finally:
+            if failure_reason:
+                trajectory.terminated = True
+                trajectory.termination_reason = TerminationReason.ERROR
+                trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
+            try:
+                await envs.connection_manager.reset_session(session)
+            except:
+                logger.error(f"Error resetting session {session.session_id}")
+            try:
+                await envs.connection_manager.close_session(session)
+            except:
+                logger.error(f"Error closing session {session.session_id}")
         return trajectory
 
     async def _get_control_plane_status(self, session) -> Optional[Dict[str, Any]]: