eval-protocol · benjibc · Aug 14, 2025 · Aug 14, 2025 · Copilot · Aug 14, 2025
diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py
@@ -74,14 +74,9 @@ async def call_agent(self) -> str:
 
             # Add all tool results to messages (they will be in the same order as tool_calls)
             for tool_call, (tool_call_id, content) in zip(message.tool_calls, tool_results):
+                tool_message_content = self._format_tool_message_content(content)
                 self.append_message_and_log(
-                    Message(
-                        role="tool",
-                        content=[
-                            ChatCompletionContentPartTextParam(text=content.text, type="text") for content in content
-                        ],
-                        tool_call_id=tool_call_id,
-                    )
+                    Message(role="tool", content=tool_message_content, tool_call_id=tool_call_id)
                 )
             return await self.call_agent()
         return message.content
@@ -114,6 +109,18 @@ def _get_content_from_tool_result(self, tool_result: CallToolResult) -> List[Tex
             raise NotImplementedError("Non-text content is not supported yet")
         return tool_result.content
 
+    def _format_tool_message_content(
+        self, content: List[TextContent]
+    ) -> Union[str, List[ChatCompletionContentPartTextParam]]:
+        """Convert tool result items into the ``content`` value of a tool message.
+
+        A lone ``TextContent`` item collapses to its bare text string; any other
+        input yields one ``type="text"`` part per item, in the original order.
+        """
+        if len(content) != 1 or not isinstance(content[0], TextContent):
+            return [ChatCompletionContentPartTextParam(text=item.text, type="text") for item in content]
+        return content[0].text
+
 
 async def default_agent_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig

diff --git a/tests/pytest/test_tool_response_single_string.py b/tests/pytest/test_tool_response_single_string.py
@@ -0,0 +1,40 @@
+import asyncio
+from typing import List, Optional
+
+from mcp.types import TextContent
+from openai.types.chat.chat_completion_message import (
+    ChatCompletionMessageToolCall,
+    FunctionCall,
+)
+
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest.default_agent_rollout_processor import Agent
+
+
+class NoOpLogger:  # Logger stand-in for this test: stores nothing, reads back nothing.
+    def read(self, row_id: Optional[str] = None) -> List[EvaluationRow]:
+        return list()  # always an empty result, whatever row_id is
+
+    def log(self, row: EvaluationRow) -> None:
+        pass  # the row is intentionally discarded
+
+
+def test_tool_result_single_text_becomes_string():
+    """Check both output shapes of ``Agent._format_tool_message_content``.
+
+    One text item must come back as a plain string, several as a list of parts.
+    """
+    # Only the formatting helper is called, so a dummy model and no-op logger suffice.
+    eval_row = EvaluationRow(messages=[Message(role="user", content="use the tool")])
+    agent = Agent(model="dummy", row=eval_row, config_path="", logger=NoOpLogger())
+
+    # One text item collapses to a bare string.
+    lone_result = agent._format_tool_message_content([TextContent(type="text", text="single result")])
+    assert isinstance(lone_result, str)
+    assert lone_result == "single result"
+
+    # Several text items stay a list of text parts, order preserved.
+    texts = ["first", "second"]
+    parts = agent._format_tool_message_content([TextContent(type="text", text=t) for t in texts])
+    assert isinstance(parts, list)
+    assert [part["text"] for part in parts] == texts