eval_protocol/benchmarks/test_frozen_lake.py (1 addition, 1 deletion)

```diff
@@ -46,7 +46,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
     num_runs=1,
     max_concurrent_rollouts=3,
     mode="pointwise",
-    server_script_path="examples/frozen_lake_mcp/server.py",
+    server_script_path="eval_protocol/mcp_servers/frozen_lake/server.py",
 )
 def test_frozen_lake_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
```
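For reference, a minimal sketch of the updated benchmark wiring in use. Only the decorator arguments come from this diff; the import paths are assumptions made for illustration:

```python
# Import paths are assumed for illustration; only the decorator arguments
# below are taken from this diff.
from eval_protocol.pytest.evaluation_test import evaluation_test
from eval_protocol.models import EvaluationRow

@evaluation_test(
    num_runs=1,
    max_concurrent_rollouts=3,
    mode="pointwise",
    # Now points at the MCP server bundled with the package instead of examples/
    server_script_path="eval_protocol/mcp_servers/frozen_lake/server.py",
)
def test_frozen_lake_evaluation(row: EvaluationRow) -> EvaluationRow:
    # Pointwise mode: receive one EvaluationRow, return the same row.
    return row
```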
eval_protocol/pytest/evaluation_test.py (40 additions, 19 deletions)

```diff
@@ -436,17 +436,23 @@ async def _execute_pointwise_eval_with_semaphore(
                 processed_row=row,
                 evaluation_test_kwargs=evaluation_test_kwargs,
             )
         except AssertionError:
             raise
         except Exception as e:
-            result = row
-            result.evaluation_result = EvaluateResult(
-                score=0.0,
-                is_score_valid=False,
-                reason=f"Error during evaluation: {type(e).__name__}: {e}",
-            )
-            if result.eval_metadata is not None:
-                result.eval_metadata.status = Status.error(
-                    f"Error during evaluation: {type(e).__name__}: {e}",
-                )
+            # Default: capture non-assert exceptions unless explicitly disabled
+            if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "1").strip() == "1":
+                result = row
+                result.evaluation_result = EvaluateResult(
+                    score=0.0,
+                    is_score_valid=False,
+                    reason=f"Error during evaluation: {type(e).__name__}: {e}",
+                )
+                if result.eval_metadata is not None:
+                    result.eval_metadata.status = Status.error(
+                        f"Error during evaluation: {type(e).__name__}: {e}",
+                    )
+            else:
+                raise
 
         if not isinstance(result, EvaluationRow):
             raise ValueError(
                 f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
```
```diff
@@ -474,18 +480,24 @@ async def _execute_groupwise_eval_with_semaphore(
                 processed_dataset=rows,
                 evaluation_test_kwargs=evaluation_test_kwargs,
             )
         except AssertionError:
             raise
         except Exception as e:
-            results = rows
-            for row in results:
-                row.evaluation_result = EvaluateResult(
-                    score=0.0,
-                    is_score_valid=False,
-                    reason=f"Error during evaluation: {type(e).__name__}: {e}",
-                )
-                if row.eval_metadata is not None:
-                    row.eval_metadata.status = Status.error(
-                        f"Error during evaluation: {type(e).__name__}: {e}",
-                    )
+            # Default: capture non-assert exceptions unless explicitly disabled
+            if os.getenv("EP_CAPTURE_EVAL_EXCEPTIONS", "1").strip() == "1":
+                results = rows
+                for row in results:
+                    row.evaluation_result = EvaluateResult(
+                        score=0.0,
+                        is_score_valid=False,
+                        reason=f"Error during evaluation: {type(e).__name__}: {e}",
+                    )
+                    if row.eval_metadata is not None:
+                        row.eval_metadata.status = Status.error(
+                            f"Error during evaluation: {type(e).__name__}: {e}",
+                        )
+            else:
+                raise
 
         if not isinstance(results, list):
             raise ValueError(
                 f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
```
```diff
@@ -704,11 +716,20 @@ async def _collect_result(config, lst):
         )
         pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)
 
+        ep_params: dict[str, Any] = {
+            "rollout_processor": rollout_processor,
+            "server_script_path": server_script_path,
+            "mcp_config_path": mcp_config_path,
+            "rollout_processor_kwargs": rollout_processor_kwargs,
+            "mode": mode,
+        }
+
         # Create the dual mode wrapper
         dual_mode_wrapper = create_dual_mode_wrapper(
             test_func, mode, max_concurrent_rollouts, max_concurrent_evaluations, pytest_wrapper
         )
 
+        setattr(dual_mode_wrapper, "__ep_params__", ep_params)
         return dual_mode_wrapper  # pyright: ignore[reportReturnType, reportUnknownVariableType]
 
     return decorator
```
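Attaching `__ep_params__` makes the decorator's configuration introspectable on the wrapped test function. A hypothetical consumer (not part of this PR) could recover it like so:

```python
from typing import Any

def read_ep_params(test_fn: Any) -> dict[str, Any]:
    # Hypothetical helper: reads the dict stored by
    # setattr(dual_mode_wrapper, "__ep_params__", ep_params) above.
    return getattr(test_fn, "__ep_params__", {})

# e.g. read_ep_params(test_frozen_lake_evaluation)["server_script_path"]
# -> "eval_protocol/mcp_servers/frozen_lake/server.py"
```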