From fb5ab0228aa2ef38d7d04254c11ecc0fe8184408 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Sat, 16 Aug 2025 22:10:50 -0700 Subject: [PATCH 1/2] fix err msg --- eval_protocol/mcp/execution/manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index 1a36afef..64998bd6 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Dict, List, Optional, Union import anyio +import httpx from openai.types import CompletionUsage from vendor.tau2.data_model.message import AssistantMessage, UserMessage From 6f0de6cea4ebd57c8d3a9519c077ad884ec6ac21 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Sun, 17 Aug 2025 00:00:19 -0700 Subject: [PATCH 2/2] fix trajectory collection --- eval_protocol/mcp/execution/manager.py | 32 +++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index 64998bd6..5c21806e 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -222,7 +222,7 @@ async def _execute_rollout( current_observation = user_message.content if user_message.content else "" user_prompt = envs.format_user_prompt(rollout_idx, current_observation) - conversation_history = [ + trajectory.conversation_history = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}, ] @@ -242,7 +242,7 @@ async def _execute_rollout( if user_simulator and user_simulator_state: # Get user simulator messages and find the last assistant message - user_simulator_messages = self._get_user_simulator_messages(conversation_history) + user_simulator_messages = self._get_user_simulator_messages(trajectory.conversation_history) # Last message was agent, simulated user response if user_simulator_messages and isinstance(user_simulator_messages[-1], AssistantMessage): @@ -253,7 +253,7 @@ async def _execute_rollout( user_content = user_message.content if user_message.content else "" user_prompt = envs.format_user_prompt(rollout_idx, user_content) - conversation_history.append({"role": "user", "content": user_prompt}) + trajectory.conversation_history.append({"role": "user", "content": user_prompt}) # Check if user simulator signaled termination if UserSimulator.is_stop(user_message): @@ -263,7 +263,7 @@ async def _execute_rollout( # In each turn: keep looping until assistant is ready to provide final response while not turn_completed and not trajectory.terminated: tool_calls, usage_stats, finish_reason = await policy( - tool_schema, rollout_idx, conversation_history + tool_schema, rollout_idx, trajectory.conversation_history ) # calc llm usage stats happened in this turn if there is aany @@ -295,7 +295,7 @@ async def _execute_rollout( rollout_idx, tool_call, tool_response, - conversation_history, + trajectory.conversation_history, reward, env_end, info, @@ -326,12 +326,14 @@ async def _execute_rollout( "num_tool_calls": 1, } print(f"🔍 control_plane_step: {control_plane_step}") - conversation_history[-1]["control_plane_step"] = control_plane_step + trajectory.conversation_history[-1]["control_plane_step"] = control_plane_step trajectory.control_plane_steps.append(control_plane_step) # Log conversation state for playback if in recording mode if recording_mode: - policy.log_conversation_state_for_playback(rollout_idx, step - 1, conversation_history) + policy.log_conversation_state_for_playback( + rollout_idx, step - 1, trajectory.conversation_history + ) if env_end: # if the env marks the end of the rollout, break the tool call loop @@ -365,17 +367,21 @@ async def _execute_rollout( "tool_calls": tool_calls_summary, "num_tool_calls": len(tool_calls), } - conversation_history[-1]["control_plane_step"] = control_plane_step + trajectory.conversation_history[-1]["control_plane_step"] = control_plane_step trajectory.control_plane_steps.append(control_plane_step) # Log conversation state for playback if in recording mode if recording_mode: - policy.log_conversation_state_for_playback(rollout_idx, step - 1, conversation_history) + policy.log_conversation_state_for_playback( + rollout_idx, step - 1, trajectory.conversation_history + ) # if the env marks end, update control plane summary and do one last policy call, then break the agent loop # this is to ensure each turn ends with an assistant message, which will align with the actual agentic llm behavior if env_end: - _, usage_stats, finish_reason = await policy(tool_schema, rollout_idx, conversation_history) + _, usage_stats, finish_reason = await policy( + tool_schema, rollout_idx, trajectory.conversation_history + ) if usage_stats: trajectory.usage["prompt_tokens"] += usage_stats.prompt_tokens trajectory.usage["completion_tokens"] += usage_stats.completion_tokens @@ -393,10 +399,10 @@ async def _execute_rollout( # Log final OpenAI conversation for terminated trajectories only if openai_logger: - if conversation_history and len(conversation_history) > 0: + if trajectory.conversation_history and len(trajectory.conversation_history) > 0: openai_logger( { - "messages": conversation_history, + "messages": trajectory.conversation_history, "metadata": { "session_id": session.session_id, "seed": session.seed, @@ -422,8 +428,6 @@ async def _execute_rollout( if not trajectory.termination_reason and step >= steps: trajectory.termination_reason = TerminationReason.MAX_STEPS - trajectory.conversation_history = conversation_history - # Add termination_reason to the final control_plane_step for msg in reversed(trajectory.conversation_history): if msg.get("control_plane_step"):