diff --git a/evalbench/generators/models/claude_code.py b/evalbench/generators/models/claude_code.py index 4318e48e..20e76bd2 100644 --- a/evalbench/generators/models/claude_code.py +++ b/evalbench/generators/models/claude_code.py @@ -478,7 +478,9 @@ def _parse_stream_json(self, stream_output: str) -> str: """Parses Claude Code stream-json output into a normalized format compatible with the eval pipeline.""" - final_obj = {"session_id": "", "response": "", "stats": {}} + final_obj = {"session_id": "", "response": "", "stats": {}, "tool_calls": []} + tool_calls = final_obj["tool_calls"] + calls_by_id = {} tool_uses = {} tool_results = {} # Fall back to the configured model if the stream's `system` init @@ -516,18 +518,33 @@ def _parse_stream_json(self, stream_output: str) -> str: # trajectory matcher can compare across # harnesses without per-generator logic. raw_name = block.get("name", "unknown") + tname = canonicalize_claude_tool_name(raw_name) tool_uses[tool_id] = { - "tool_name": canonicalize_claude_tool_name(raw_name), + "tool_name": tname, "parameters": block.get("input", {}), } + call = { + "tool_id": tool_id, + "tool_name": tname, + "parameters": block.get("input", {}), + "status": None, + "response": None, + } + tool_calls.append(call) + calls_by_id[tool_id] = call elif event_type == "tool_result": tool_id = event.get("tool_use_id") or event.get("id", "") is_error = event.get("is_error", False) + status = "error" if is_error else "success" tool_results[tool_id] = { - "status": "error" if is_error else "success", + "status": status, "content": event.get("content", ""), } + if tool_id in calls_by_id: + call = calls_by_id[tool_id] + call["status"] = status + call["response"] = event.get("content", "") elif event_type == "result": if "session_id" in event: diff --git a/evalbench/generators/models/codex_cli.py b/evalbench/generators/models/codex_cli.py index 1793b423..77866f8f 100644 --- a/evalbench/generators/models/codex_cli.py +++ 
b/evalbench/generators/models/codex_cli.py @@ -781,7 +781,9 @@ def _parse_stream_json( """ tool_durations = tool_durations or {} - final_obj = {"session_id": "", "response": "", "stats": {}} + final_obj = {"session_id": "", "response": "", "stats": {}, "tool_calls": []} + tool_calls = final_obj["tool_calls"] + calls_by_id = {} tool_uses: dict[str, dict] = {} tool_results: dict[str, dict] = {} usage: dict = {} @@ -853,8 +855,24 @@ def item_payload(item: dict) -> dict: # can compare across harnesses without per-generator logic. server = payload.get("server", "") tool = payload.get("tool", "unknown") + tname = canonical_tool_name(server, tool) + + if item_id not in calls_by_id: + call = { + "tool_id": item_id, + "tool_name": tname, + "parameters": self._coerce_json(payload.get("arguments", {})), + "status": None, + "response": None, + } + tool_calls.append(call) + calls_by_id[item_id] = call + else: + call = calls_by_id[item_id] + call["parameters"] = self._coerce_json(payload.get("arguments", {})) + tool_uses[item_id] = { - "tool_name": canonical_tool_name(server, tool), + "tool_name": tname, "server": server, "parameters": self._coerce_json(payload.get("arguments", {})), } @@ -863,44 +881,100 @@ def item_payload(item: dict) -> dict: is_error = bool(payload.get("error")) or status not in ( "", "completed", "success", "ok", ) + tstatus = "error" if is_error else "success" tool_results[item_id] = { - "status": "error" if is_error else "success", + "status": tstatus, "content": payload.get("result", ""), } + call["status"] = tstatus + call["response"] = payload.get("result", "") elif kind == "command_execution": + cmd = payload.get("command", "") + if item_id not in calls_by_id: + call = { + "tool_id": item_id, + "tool_name": "shell", + "parameters": {"command": cmd}, + "status": None, + "response": None, + } + tool_calls.append(call) + calls_by_id[item_id] = call + else: + call = calls_by_id[item_id] + call["parameters"] = {"command": cmd} + tool_uses[item_id] = { 
"tool_name": "shell", - "parameters": {"command": payload.get("command", "")}, + "parameters": {"command": cmd}, } if event_type == self._EV_ITEM_COMPLETED: exit_code = payload.get("exit_code") is_error = bool(exit_code) and exit_code != 0 + tstatus = "error" if is_error else "success" tool_results[item_id] = { - "status": "error" if is_error else "success", + "status": tstatus, "content": payload.get("aggregated_output", ""), } + call["status"] = tstatus + call["response"] = payload.get("aggregated_output", "") elif kind == "web_search": + q = payload.get("query", "") + if item_id not in calls_by_id: + call = { + "tool_id": item_id, + "tool_name": "web_search", + "parameters": {"query": q}, + "status": None, + "response": None, + } + tool_calls.append(call) + calls_by_id[item_id] = call + else: + call = calls_by_id[item_id] + call["parameters"] = {"query": q} + tool_uses[item_id] = { "tool_name": "web_search", - "parameters": {"query": payload.get("query", "")}, + "parameters": {"query": q}, } if event_type == self._EV_ITEM_COMPLETED: tool_results[item_id] = {"status": "success", "content": ""} + call["status"] = "success" + call["response"] = "" elif kind == "file_change": + changes = payload.get("changes", []) + if item_id not in calls_by_id: + call = { + "tool_id": item_id, + "tool_name": "file_change", + "parameters": {"changes": changes}, + "status": None, + "response": None, + } + tool_calls.append(call) + calls_by_id[item_id] = call + else: + call = calls_by_id[item_id] + call["parameters"] = {"changes": changes} + tool_uses[item_id] = { "tool_name": "file_change", - "parameters": {"changes": payload.get("changes", [])}, + "parameters": {"changes": changes}, } if event_type == self._EV_ITEM_COMPLETED: status = payload.get("status", "") is_error = status not in ("", "completed", "success", "ok") + tstatus = "error" if is_error else "success" tool_results[item_id] = { - "status": "error" if is_error else "success", + "status": tstatus, "content": "", } + 
call["status"] = tstatus + call["response"] = "" input_tokens = int(usage.get("input_tokens", 0) or 0) output_tokens = int(usage.get("output_tokens", 0) or 0) diff --git a/evalbench/generators/models/gemini_cli.py b/evalbench/generators/models/gemini_cli.py index a82e91f7..1c9a298b 100644 --- a/evalbench/generators/models/gemini_cli.py +++ b/evalbench/generators/models/gemini_cli.py @@ -878,7 +878,9 @@ def _run_gemini_cli(self, cli_cmd: CLICommand): def _parse_stream_json(self, stream_output: str) -> str: import dateutil.parser - final_obj = {"session_id": "", "response": "", "stats": {}} + final_obj = {"session_id": "", "response": "", "stats": {}, "tool_calls": []} + tool_calls = final_obj["tool_calls"] + calls_by_id = {} tool_uses = {} tool_results = {} model_name = "gemini-2.5-flash" @@ -899,10 +901,24 @@ def _parse_stream_json(self, stream_output: str) -> str: tool_id = event.get("tool_id") if tool_id: tool_uses[tool_id] = event + tname = canonicalize_gemini_tool_name(event.get("tool_name", "unknown")) + call = { + "tool_id": tool_id, + "tool_name": tname, + "parameters": event.get("parameters", {}), + "status": None, + "response": None, + } + tool_calls.append(call) + calls_by_id[tool_id] = call elif t == "tool_result": tool_id = event.get("tool_id") if tool_id: tool_results[tool_id] = event + if tool_id in calls_by_id: + call = calls_by_id[tool_id] + call["status"] = event.get("status") + call["response"] = event.get("result") or event.get("content") or event.get("output") elif t == "result": s = event.get("stats", {}) total_duration = s.get("duration_ms", 0) diff --git a/evalbench/generators/prompts/simulateduser.py b/evalbench/generators/prompts/simulateduser.py index b5ab4077..9ba8c5c8 100644 --- a/evalbench/generators/prompts/simulateduser.py +++ b/evalbench/generators/prompts/simulateduser.py @@ -27,15 +27,30 @@ def setup(self): pass def generate(self, item): + import json # item is the payload dictionary plan = item.get("conversation_plan", "") 
history_list = item.get("history", []) last_reply = item.get("last_agent_reply", "") + try: + parsed_last_reply = json.loads(last_reply) + if isinstance(parsed_last_reply, dict) and "response" in parsed_last_reply: + last_reply = parsed_last_reply["response"] + except Exception: + pass + # Format history history_str = "" for turn in history_list: - history_str += f"User: {turn['user']}\nAgent: {turn['agent']}\n" + agent_content = turn.get("agent", "") + try: + parsed_agent = json.loads(agent_content) + if isinstance(parsed_agent, dict) and "response" in parsed_agent: + agent_content = parsed_agent["response"] + except Exception: + pass + history_str += f"User: {turn.get('user', '')}\nAgent: {agent_content}\n" prompt = self.prompt_template.replace( "[[conversation_plan]]", str(plan)) diff --git a/evalbench/scorers/behavioralmetrics.py b/evalbench/scorers/behavioralmetrics.py index 604f6ed8..c424a0f4 100644 --- a/evalbench/scorers/behavioralmetrics.py +++ b/evalbench/scorers/behavioralmetrics.py @@ -3,6 +3,7 @@ from scorers import comparator from generators.models import get_generator from .prompt.behavioralmetrics import BEHAVIORAL_METRICS_PROMPT +from scorers.util import format_conversation_history import json import re @@ -19,6 +20,7 @@ def __init__(self, config: dict, global_models): if not self.model_config: raise ValueError("model_config is required for BehavioralMetrics") self.model = get_generator(global_models, self.model_config) + self.include_tool_calls = config.get("include_tool_calls", False) def compare( self, @@ -50,9 +52,13 @@ def compare( scenario = context.get("scenario", {}) conversation_plan = scenario.get("conversation_plan", "") + formatted_history = format_conversation_history( + conversation_history, include_tool_calls=self.include_tool_calls + ) + prompt = BEHAVIORAL_METRICS_PROMPT.format( conversation_plan=conversation_plan, - conversation_history=conversation_history + conversation_history=formatted_history ) try: @@ -64,12 +70,12 @@ def 
compare( clarifications = 0 hallucination_match = re.search( - r'Hallucination Count:\s*(\\d+)', response_text) + r'Hallucination Count:\s*(\d+)', response_text) if hallucination_match: hallucinations = int(hallucination_match.group(1)) clarification_match = re.search( - r'Clarification Count:\s*(\\d+)', response_text) + r'Clarification Count:\s*(\d+)', response_text) if clarification_match: clarifications = int(clarification_match.group(1)) @@ -83,3 +89,4 @@ def compare( except Exception as e: logging.error(f'BehavioralMetrics generation failed: {e}') return 0.0, f"Error calling model: {e}" + diff --git a/evalbench/scorers/binaryrubricscorer.py b/evalbench/scorers/binaryrubricscorer.py index bf2e1c58..bf40c6fd 100644 --- a/evalbench/scorers/binaryrubricscorer.py +++ b/evalbench/scorers/binaryrubricscorer.py @@ -3,6 +3,7 @@ from scorers import comparator from generators.models import get_generator from scorers.prompt.binaryrubricscorer import BINARY_RUBRIC_EVAL_PROMPT +from scorers.util import format_conversation_history import re import json @@ -21,6 +22,7 @@ def __init__(self, config: dict, global_models, if not self.model_config: raise ValueError("model_config is required for BinaryRubricScorer") self.model = get_generator(global_models, self.model_config) + self.include_tool_calls = config.get("include_tool_calls", False) def compare( self, @@ -61,9 +63,13 @@ def compare( "No rubric defined for this scenario. Defaulting to PASS." 
) + formatted_history = format_conversation_history( + conversation_history, include_tool_calls=self.include_tool_calls + ) + prompt = BINARY_RUBRIC_EVAL_PROMPT.format( rubric_item=criterion_to_evaluate, - conversation_history=conversation_history + conversation_history=formatted_history ) try: @@ -85,3 +91,4 @@ def compare( except Exception as e: logging.error(f'BinaryRubricScorer generation failed: {e}') return 0.0, f"Error calling model: {e}" + diff --git a/evalbench/scorers/goalcompletionrate.py b/evalbench/scorers/goalcompletionrate.py index 89ce5d08..13397469 100644 --- a/evalbench/scorers/goalcompletionrate.py +++ b/evalbench/scorers/goalcompletionrate.py @@ -3,6 +3,7 @@ from scorers import comparator from generators.models import get_generator from .prompt.goalcompletion import GOAL_COMPLETION_PROMPT +from scorers.util import format_conversation_history import json @@ -17,6 +18,7 @@ def __init__(self, config: dict, global_models): if not self.model_config: raise ValueError("model_config is required for GoalCompletionRate") self.model = get_generator(global_models, self.model_config) + self.include_tool_calls = config.get("include_tool_calls", False) def compare( self, @@ -48,9 +50,13 @@ def compare( scenario = context.get("scenario", {}) conversation_plan = scenario.get("conversation_plan", "") + formatted_history = format_conversation_history( + conversation_history, include_tool_calls=self.include_tool_calls + ) + prompt = GOAL_COMPLETION_PROMPT.format( conversation_plan=conversation_plan, - conversation_history=conversation_history + conversation_history=formatted_history ) try: @@ -58,10 +64,11 @@ def compare( response_text = getattr( response, 'stdout', response) if response else "" if isinstance(response_text, str): - first_line = response_text.strip().split('\\n')[0].upper() + first_line = response_text.strip().split('\n')[0].upper() score = 100.0 if "PASS" in first_line else 0.0 return score, response_text return 0.0, "Failed to parse LLM evaluation 
def _split_agent_reply(agent_raw: Any) -> tuple:
    """Split one agent turn into ``(display_text, tool_calls)``.

    ``agent_raw`` may be plain text, a JSON string, or an already-decoded
    object. Only a dict is treated as a structured reply; anything else is
    returned unchanged with no tool calls.
    """
    import json

    parsed = agent_raw
    if isinstance(agent_raw, str):
        try:
            parsed = json.loads(agent_raw)
        except ValueError:
            # Not JSON at all: an ordinary plain-text reply.
            return agent_raw, []

    if not isinstance(parsed, dict):
        return agent_raw, []

    if "response" in parsed:
        response = parsed["response"]
        content = "" if response is None else response
    else:
        # A JSON object without "response" is kept verbatim rather than
        # being silently replaced by an empty string.
        content = agent_raw

    raw_calls = parsed.get("tool_calls")
    # Drop malformed entries instead of crashing on `.get` later.
    calls = (
        [call for call in raw_calls if isinstance(call, dict)]
        if isinstance(raw_calls, list)
        else []
    )
    return content, calls


def _format_tool_call(call: dict) -> str:
    """Render a single tool-call record as one transcript entry."""
    name = call.get("tool_name") or call.get("name") or "unknown_tool"
    args = call.get("parameters") or call.get("args") or {}
    # str() guards against non-string statuses (e.g. numeric codes).
    status = str(call.get("status") or "executed").upper()
    resp = call.get("response") or ""
    return f"Agent invoked {name}({args}) -> {status}:\n {resp}\n"


def format_conversation_history(history: Any, include_tool_calls: bool = False) -> str:
    """Render a conversation history as a plain-text transcript.

    Args:
        history: A list of turns, or a JSON string that decodes to one. Each
            turn is read through its ``"user"`` and ``"agent"`` keys; the
            agent value may itself be a JSON object (string or dict) with
            ``"response"`` and ``"tool_calls"`` entries.
        include_tool_calls: When true, each recorded tool call is emitted
            between the ``User:`` and ``Agent:`` lines of its turn.

    Returns:
        ``"[]"`` for an empty or missing history, ``str(history)`` when the
        input is not (or does not decode to) a list, otherwise the
        transcript text.
    """
    import json

    if not history:
        return "[]"

    turns = history
    if isinstance(history, str):
        try:
            turns = json.loads(history)
        except ValueError:
            return str(history)

    if not isinstance(turns, list):
        return str(history)
    if not turns:
        # The JSON text "[]" is as empty as the list [] handled above.
        return "[]"

    parts = []
    for turn in turns:
        if not isinstance(turn, dict):
            # Keep unexpected entries visible instead of raising.
            parts.append(f"{turn}\n")
            continue

        agent_content, tool_calls = _split_agent_reply(turn.get("agent", ""))
        parts.append(f"User: {turn.get('user', '')}\n")
        if include_tool_calls:
            parts.extend(_format_tool_call(call) for call in tool_calls)
        parts.append(f"Agent: {agent_content}\n")

    return "".join(parts)
+ +def test_parse_stream_json(monkeypatch): + monkeypatch.setenv("HOME", "/fake/home") + with ( + patch('generators.models.gemini_cli.os.path.exists', return_value=False), + patch('generators.models.gemini_cli.os.makedirs'), + patch('generators.models.gemini_cli.open', create=True), + ): + generator = GeminiCliGenerator({}) + + stream_output = "\n".join([ + '{"type": "init", "session_id": "session_123", "model": "gemini-2.5-pro"}', + '{"type": "message", "role": "assistant", "content": "Hello "}', + '{"type": "tool_use", "tool_id": "call_abc", "tool_name": "mcp_myserver_mytool", "parameters": {"arg1": "val1"}}', + '{"type": "message", "role": "assistant", "content": "world!"}', + '{"type": "tool_result", "tool_id": "call_abc", "status": "success", "result": "tool-result-output"}', + '{"type": "result", "stats": {"duration_ms": 1500, "input_tokens": 10, "output_tokens": 20, "total_tokens": 30, "cached": 5}}' + ]) + + parsed_raw = generator._parse_stream_json(stream_output) + import json + parsed = json.loads(parsed_raw) + + assert parsed["session_id"] == "session_123" + assert parsed["response"] == "Hello world!" 
+ assert len(parsed["tool_calls"]) == 1 + + call = parsed["tool_calls"][0] + assert call["tool_id"] == "call_abc" + assert call["tool_name"] == "myserver__mytool" + assert call["parameters"] == {"arg1": "val1"} + assert call["status"] == "success" + assert call["response"] == "tool-result-output" + diff --git a/evalbench/test/scorers_util_test.py b/evalbench/test/scorers_util_test.py new file mode 100644 index 00000000..b0005f58 --- /dev/null +++ b/evalbench/test/scorers_util_test.py @@ -0,0 +1,50 @@ +import unittest +from scorers.util import format_conversation_history + + +class TestScorersUtil(unittest.TestCase): + + def test_format_conversation_history_without_tool_calls(self): + history = [ + { + "user": "List Cloud SQL instances.", + "agent": '{"response": "Here is the list: instance-1", "tool_calls": [{"tool_name": "list_instances", "parameters": {}, "status": "success", "response": "instance-1"}]}' + } + ] + + formatted = format_conversation_history(history, include_tool_calls=False) + expected = "User: List Cloud SQL instances.\nAgent: Here is the list: instance-1\n" + self.assertEqual(formatted, expected) + + def test_format_conversation_history_with_tool_calls(self): + history = [ + { + "user": "List Cloud SQL instances.", + "agent": '{"response": "Here is the list: instance-1", "tool_calls": [{"tool_name": "list_instances", "parameters": {"proj": "p1"}, "status": "success", "response": "[instance-1]"}]}' + } + ] + + formatted = format_conversation_history(history, include_tool_calls=True) + expected = ( + "User: List Cloud SQL instances.\n" + "Agent invoked list_instances({'proj': 'p1'}) -> SUCCESS:\n" + " [instance-1]\n" + "Agent: Here is the list: instance-1\n" + ) + self.assertEqual(formatted, expected) + + def test_format_conversation_history_malformed_fallback(self): + history = [ + { + "user": "Hello", + "agent": "Plain text agent response" + } + ] + + formatted = format_conversation_history(history, include_tool_calls=True) + expected = "User: 
Hello\nAgent: Plain text agent response\n" + self.assertEqual(formatted, expected) + + +if __name__ == '__main__': + unittest.main() diff --git a/evalbench/test/simulateduser_test.py b/evalbench/test/simulateduser_test.py new file mode 100644 index 00000000..e1a3dd23 --- /dev/null +++ b/evalbench/test/simulateduser_test.py @@ -0,0 +1,34 @@ +import unittest +from generators.prompts.simulateduser import SimulatedUserPromptGenerator + + +class TestSimulatedUserPromptGenerator(unittest.TestCase): + + def test_generate_cleans_json_history_and_last_reply(self): + generator = SimulatedUserPromptGenerator(None, {}) + + history = [ + { + "user": "List instances", + "agent": '{\n "session_id": "session_1",\n "response": "Here is the list of instances: instance-1",\n "stats": {}\n}' + } + ] + last_reply = '{\n "session_id": "session_1",\n "response": "What else can I help with?",\n "stats": {}\n}' + + item = { + "conversation_plan": "Verify listing instances", + "history": history, + "last_agent_reply": last_reply + } + + result = generator.generate(item) + prompt = result["prompt"] + + self.assertIn("Agent: Here is the list of instances: instance-1", prompt) + self.assertNotIn("session_id", prompt) + self.assertNotIn("stats", prompt) + self.assertIn("# Last Agent Reply:\nWhat else can I help with?", prompt) + + +if __name__ == '__main__': + unittest.main() diff --git a/uv.lock b/uv.lock index 9dab9662..301fe7db 100644 --- a/uv.lock +++ b/uv.lock @@ -1800,7 +1800,7 @@ wheels = [ [[package]] name = "google-evalbench" -version = "1.7.1" +version = "1.8.0" source = { editable = "." } dependencies = [ { name = "a2a-sdk" },