Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions evalbench/generators/models/claude_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,9 @@ def _parse_stream_json(self, stream_output: str) -> str:
"""Parses Claude Code stream-json output into a normalized format
compatible with the eval pipeline."""

final_obj = {"session_id": "", "response": "", "stats": {}}
final_obj = {"session_id": "", "response": "", "stats": {}, "tool_calls": []}
tool_calls = final_obj["tool_calls"]
calls_by_id = {}
tool_uses = {}
tool_results = {}
# Fall back to the configured model if the stream's `system` init
Expand Down Expand Up @@ -516,18 +518,33 @@ def _parse_stream_json(self, stream_output: str) -> str:
# trajectory matcher can compare across
# harnesses without per-generator logic.
raw_name = block.get("name", "unknown")
tname = canonicalize_claude_tool_name(raw_name)
tool_uses[tool_id] = {
"tool_name": canonicalize_claude_tool_name(raw_name),
"tool_name": tname,
"parameters": block.get("input", {}),
}
call = {
"tool_id": tool_id,
"tool_name": tname,
"parameters": block.get("input", {}),
"status": None,
"response": None,
}
tool_calls.append(call)
calls_by_id[tool_id] = call

elif event_type == "tool_result":
tool_id = event.get("tool_use_id") or event.get("id", "")
is_error = event.get("is_error", False)
status = "error" if is_error else "success"
tool_results[tool_id] = {
"status": "error" if is_error else "success",
"status": status,
"content": event.get("content", ""),
}
if tool_id in calls_by_id:
call = calls_by_id[tool_id]
call["status"] = status
call["response"] = event.get("content", "")

elif event_type == "result":
if "session_id" in event:
Expand Down
90 changes: 82 additions & 8 deletions evalbench/generators/models/codex_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,7 +781,9 @@ def _parse_stream_json(
"""
tool_durations = tool_durations or {}

final_obj = {"session_id": "", "response": "", "stats": {}}
final_obj = {"session_id": "", "response": "", "stats": {}, "tool_calls": []}
tool_calls = final_obj["tool_calls"]
calls_by_id = {}
tool_uses: dict[str, dict] = {}
tool_results: dict[str, dict] = {}
usage: dict = {}
Expand Down Expand Up @@ -853,8 +855,24 @@ def item_payload(item: dict) -> dict:
# can compare across harnesses without per-generator logic.
server = payload.get("server", "")
tool = payload.get("tool", "unknown")
tname = canonical_tool_name(server, tool)

if item_id not in calls_by_id:
call = {
"tool_id": item_id,
"tool_name": tname,
"parameters": self._coerce_json(payload.get("arguments", {})),
"status": None,
"response": None,
}
tool_calls.append(call)
calls_by_id[item_id] = call
else:
call = calls_by_id[item_id]
call["parameters"] = self._coerce_json(payload.get("arguments", {}))

tool_uses[item_id] = {
"tool_name": canonical_tool_name(server, tool),
"tool_name": tname,
"server": server,
"parameters": self._coerce_json(payload.get("arguments", {})),
}
Expand All @@ -863,44 +881,100 @@ def item_payload(item: dict) -> dict:
is_error = bool(payload.get("error")) or status not in (
"", "completed", "success", "ok",
)
tstatus = "error" if is_error else "success"
tool_results[item_id] = {
"status": "error" if is_error else "success",
"status": tstatus,
"content": payload.get("result", ""),
}
call["status"] = tstatus
call["response"] = payload.get("result", "")

elif kind == "command_execution":
cmd = payload.get("command", "")
if item_id not in calls_by_id:
call = {
"tool_id": item_id,
"tool_name": "shell",
"parameters": {"command": cmd},
"status": None,
"response": None,
}
tool_calls.append(call)
calls_by_id[item_id] = call
else:
call = calls_by_id[item_id]
call["parameters"] = {"command": cmd}

tool_uses[item_id] = {
"tool_name": "shell",
"parameters": {"command": payload.get("command", "")},
"parameters": {"command": cmd},
}
if event_type == self._EV_ITEM_COMPLETED:
exit_code = payload.get("exit_code")
is_error = bool(exit_code) and exit_code != 0
tstatus = "error" if is_error else "success"
tool_results[item_id] = {
"status": "error" if is_error else "success",
"status": tstatus,
"content": payload.get("aggregated_output", ""),
}
call["status"] = tstatus
call["response"] = payload.get("aggregated_output", "")

elif kind == "web_search":
q = payload.get("query", "")
if item_id not in calls_by_id:
call = {
"tool_id": item_id,
"tool_name": "web_search",
"parameters": {"query": q},
"status": None,
"response": None,
}
tool_calls.append(call)
calls_by_id[item_id] = call
else:
call = calls_by_id[item_id]
call["parameters"] = {"query": q}

tool_uses[item_id] = {
"tool_name": "web_search",
"parameters": {"query": payload.get("query", "")},
"parameters": {"query": q},
}
if event_type == self._EV_ITEM_COMPLETED:
tool_results[item_id] = {"status": "success", "content": ""}
call["status"] = "success"
call["response"] = ""

elif kind == "file_change":
changes = payload.get("changes", [])
if item_id not in calls_by_id:
call = {
"tool_id": item_id,
"tool_name": "file_change",
"parameters": {"changes": changes},
"status": None,
"response": None,
}
tool_calls.append(call)
calls_by_id[item_id] = call
else:
call = calls_by_id[item_id]
call["parameters"] = {"changes": changes}

tool_uses[item_id] = {
"tool_name": "file_change",
"parameters": {"changes": payload.get("changes", [])},
"parameters": {"changes": changes},
}
if event_type == self._EV_ITEM_COMPLETED:
status = payload.get("status", "")
is_error = status not in ("", "completed", "success", "ok")
tstatus = "error" if is_error else "success"
tool_results[item_id] = {
"status": "error" if is_error else "success",
"status": tstatus,
"content": "",
}
call["status"] = tstatus
call["response"] = ""

input_tokens = int(usage.get("input_tokens", 0) or 0)
output_tokens = int(usage.get("output_tokens", 0) or 0)
Expand Down
18 changes: 17 additions & 1 deletion evalbench/generators/models/gemini_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,7 +878,9 @@ def _run_gemini_cli(self, cli_cmd: CLICommand):
def _parse_stream_json(self, stream_output: str) -> str:
import dateutil.parser

final_obj = {"session_id": "", "response": "", "stats": {}}
final_obj = {"session_id": "", "response": "", "stats": {}, "tool_calls": []}
tool_calls = final_obj["tool_calls"]
calls_by_id = {}
tool_uses = {}
tool_results = {}
model_name = "gemini-2.5-flash"
Expand All @@ -899,10 +901,24 @@ def _parse_stream_json(self, stream_output: str) -> str:
tool_id = event.get("tool_id")
if tool_id:
tool_uses[tool_id] = event
tname = canonicalize_gemini_tool_name(event.get("tool_name", "unknown"))
call = {
"tool_id": tool_id,
"tool_name": tname,
"parameters": event.get("parameters", {}),
"status": None,
"response": None,
}
tool_calls.append(call)
calls_by_id[tool_id] = call
elif t == "tool_result":
tool_id = event.get("tool_id")
if tool_id:
tool_results[tool_id] = event
if tool_id in calls_by_id:
call = calls_by_id[tool_id]
call["status"] = event.get("status")
call["response"] = event.get("result") or event.get("content") or event.get("output")
elif t == "result":
s = event.get("stats", {})
total_duration = s.get("duration_ms", 0)
Expand Down
17 changes: 16 additions & 1 deletion evalbench/generators/prompts/simulateduser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,30 @@ def setup(self):
pass

def generate(self, item):
import json
# item is the payload dictionary
plan = item.get("conversation_plan", "")
history_list = item.get("history", [])
last_reply = item.get("last_agent_reply", "")

try:
parsed_last_reply = json.loads(last_reply)
if isinstance(parsed_last_reply, dict) and "response" in parsed_last_reply:
last_reply = parsed_last_reply["response"]
except Exception:
pass

# Format history
history_str = ""
for turn in history_list:
history_str += f"User: {turn['user']}\nAgent: {turn['agent']}\n"
agent_content = turn.get("agent", "")
try:
parsed_agent = json.loads(agent_content)
if isinstance(parsed_agent, dict) and "response" in parsed_agent:
agent_content = parsed_agent["response"]
except Exception:
pass
history_str += f"User: {turn.get('user', '')}\nAgent: {agent_content}\n"

prompt = self.prompt_template.replace(
"[[conversation_plan]]", str(plan))
Expand Down
13 changes: 10 additions & 3 deletions evalbench/scorers/behavioralmetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from scorers import comparator
from generators.models import get_generator
from .prompt.behavioralmetrics import BEHAVIORAL_METRICS_PROMPT
from scorers.util import format_conversation_history
import json
import re

Expand All @@ -19,6 +20,7 @@ def __init__(self, config: dict, global_models):
if not self.model_config:
raise ValueError("model_config is required for BehavioralMetrics")
self.model = get_generator(global_models, self.model_config)
self.include_tool_calls = config.get("include_tool_calls", False)

def compare(
self,
Expand Down Expand Up @@ -50,9 +52,13 @@ def compare(
scenario = context.get("scenario", {})
conversation_plan = scenario.get("conversation_plan", "")

formatted_history = format_conversation_history(
conversation_history, include_tool_calls=self.include_tool_calls
)

prompt = BEHAVIORAL_METRICS_PROMPT.format(
conversation_plan=conversation_plan,
conversation_history=conversation_history
conversation_history=formatted_history
)

try:
Expand All @@ -64,12 +70,12 @@ def compare(
clarifications = 0

hallucination_match = re.search(
r'Hallucination Count:\s*(\\d+)', response_text)
r'Hallucination Count:\s*(\d+)', response_text)
if hallucination_match:
hallucinations = int(hallucination_match.group(1))

clarification_match = re.search(
r'Clarification Count:\s*(\\d+)', response_text)
r'Clarification Count:\s*(\d+)', response_text)
if clarification_match:
clarifications = int(clarification_match.group(1))

Expand All @@ -83,3 +89,4 @@ def compare(
except Exception as e:
logging.error(f'BehavioralMetrics generation failed: {e}')
return 0.0, f"Error calling model: {e}"

9 changes: 8 additions & 1 deletion evalbench/scorers/binaryrubricscorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from scorers import comparator
from generators.models import get_generator
from scorers.prompt.binaryrubricscorer import BINARY_RUBRIC_EVAL_PROMPT
from scorers.util import format_conversation_history
import re
import json

Expand All @@ -21,6 +22,7 @@ def __init__(self, config: dict, global_models,
if not self.model_config:
raise ValueError("model_config is required for BinaryRubricScorer")
self.model = get_generator(global_models, self.model_config)
self.include_tool_calls = config.get("include_tool_calls", False)

def compare(
self,
Expand Down Expand Up @@ -61,9 +63,13 @@ def compare(
"No rubric defined for this scenario. Defaulting to PASS."
)

formatted_history = format_conversation_history(
conversation_history, include_tool_calls=self.include_tool_calls
)

prompt = BINARY_RUBRIC_EVAL_PROMPT.format(
rubric_item=criterion_to_evaluate,
conversation_history=conversation_history
conversation_history=formatted_history
)

try:
Expand All @@ -85,3 +91,4 @@ def compare(
except Exception as e:
logging.error(f'BinaryRubricScorer generation failed: {e}')
return 0.0, f"Error calling model: {e}"

11 changes: 9 additions & 2 deletions evalbench/scorers/goalcompletionrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from scorers import comparator
from generators.models import get_generator
from .prompt.goalcompletion import GOAL_COMPLETION_PROMPT
from scorers.util import format_conversation_history
import json


Expand All @@ -17,6 +18,7 @@ def __init__(self, config: dict, global_models):
if not self.model_config:
raise ValueError("model_config is required for GoalCompletionRate")
self.model = get_generator(global_models, self.model_config)
self.include_tool_calls = config.get("include_tool_calls", False)

def compare(
self,
Expand Down Expand Up @@ -48,20 +50,25 @@ def compare(
scenario = context.get("scenario", {})
conversation_plan = scenario.get("conversation_plan", "")

formatted_history = format_conversation_history(
conversation_history, include_tool_calls=self.include_tool_calls
)

prompt = GOAL_COMPLETION_PROMPT.format(
conversation_plan=conversation_plan,
conversation_history=conversation_history
conversation_history=formatted_history
)

try:
response = self.model.generate(prompt)
response_text = getattr(
response, 'stdout', response) if response else ""
if isinstance(response_text, str):
first_line = response_text.strip().split('\\n')[0].upper()
first_line = response_text.strip().split('\n')[0].upper()
score = 100.0 if "PASS" in first_line else 0.0
return score, response_text
return 0.0, "Failed to parse LLM evaluation response."
except Exception as e:
logging.error(f'GoalCompletionRate generation failed: {e}')
return 0.0, f"Error calling model: {e}"

Loading
Loading