Return a plain string for single-item tool results.

A tool result that consists of exactly one TextContent item is sent as a
plain string in the "tool" message; any other result is sent as a list of
text parts. The formatting lives in Agent._format_tool_message_content, and
a unit test covering the single-item and multi-item cases is added.

diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py
index 50f12231..57b3ef73 100644
--- a/eval_protocol/pytest/default_agent_rollout_processor.py
+++ b/eval_protocol/pytest/default_agent_rollout_processor.py
@@ -74,14 +74,9 @@ async def call_agent(self) -> str:
 
             # Add all tool results to messages (they will be in the same order as tool_calls)
             for tool_call, (tool_call_id, content) in zip(message.tool_calls, tool_results):
+                tool_message_content = self._format_tool_message_content(content)
                 self.append_message_and_log(
-                    Message(
-                        role="tool",
-                        content=[
-                            ChatCompletionContentPartTextParam(text=content.text, type="text") for content in content
-                        ],
-                        tool_call_id=tool_call_id,
-                    )
+                    Message(role="tool", content=tool_message_content, tool_call_id=tool_call_id)
                 )
             return await self.call_agent()
         return message.content
@@ -114,6 +109,18 @@ def _get_content_from_tool_result(self, tool_result: CallToolResult) -> List[Tex
             raise NotImplementedError("Non-text content is not supported yet")
         return tool_result.content
 
+    def _format_tool_message_content(
+        self, content: List[TextContent]
+    ) -> Union[str, List[ChatCompletionContentPartTextParam]]:
+        """Format tool result content for inclusion in a tool message.
+
+        - If a single text item, return plain string per OpenAI semantics.
+        - If multiple items, return a list of text parts.
+        """
+        if len(content) == 1 and isinstance(content[0], TextContent):
+            return content[0].text
+        return [ChatCompletionContentPartTextParam(text=c.text, type="text") for c in content]
+
 
 async def default_agent_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
diff --git a/tests/pytest/test_tool_response_single_string.py b/tests/pytest/test_tool_response_single_string.py
new file mode 100644
index 00000000..87d1c391
--- /dev/null
+++ b/tests/pytest/test_tool_response_single_string.py
@@ -0,0 +1,40 @@
+import asyncio
+from typing import List, Optional
+
+from mcp.types import TextContent
+from openai.types.chat.chat_completion_message import (
+    ChatCompletionMessageToolCall,
+    FunctionCall,
+)
+
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest.default_agent_rollout_processor import Agent
+
+
+class NoOpLogger:
+    def log(self, row: EvaluationRow) -> None:
+        return None
+
+    def read(self, row_id: Optional[str] = None) -> List[EvaluationRow]:
+        return []
+
+
+def test_tool_result_single_text_becomes_string():
+    # Prepare a minimal evaluation row and agent
+    row = EvaluationRow(messages=[Message(role="user", content="use the tool")])
+    agent = Agent(model="dummy", row=row, config_path="", logger=NoOpLogger())
+
+    # Single text content becomes a plain string
+    single = [TextContent(type="text", text="single result")]
+    formatted = agent._format_tool_message_content(single)
+    assert isinstance(formatted, str)
+    assert formatted == "single result"
+
+    # Multiple text contents become a list of text parts
+    multiple = [
+        TextContent(type="text", text="first"),
+        TextContent(type="text", text="second"),
+    ]
+    formatted_multi = agent._format_tool_message_content(multiple)
+    assert isinstance(formatted_multi, list)
+    assert [part["text"] for part in formatted_multi] == ["first", "second"]