Skip to content

Commit e9adadc

Browse files
author
Dylan Huang
committed
test reproduces error properly
1 parent b0f92ac commit e9adadc

3 files changed

Lines changed: 71 additions & 4 deletions

File tree

eval_protocol/mcp/mcp_multi_client.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,7 @@ async def connect_to_servers(self):
6969
return
7070

7171
for server_name, server_config in self.config.mcpServers.items():
72-
try:
73-
await self._connect_to_server(server_name, server_config)
74-
except Exception as e:
75-
print(f"Failed to connect to server '{server_name}': {e}")
72+
await self._connect_to_server(server_name, server_config)
7673

7774
async def _connect_to_server(
7875
self, server_name: str, server_config: Union[MCPConfigurationServerStdio, MCPConfigurationServerUrl]
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"mcpServers": {
3+
"docs.fireworks.ai": {
4+
"url": "https://docs.fireworks.ai/mcp-non-existent"
5+
}
6+
}
7+
}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from typing import Set
2+
from eval_protocol.models import EvaluationRow, Message
3+
from eval_protocol.pytest.default_agent_rollout_processor import AgentRolloutProcessor
4+
from eval_protocol.dataset_logger import DatasetLogger
5+
6+
7+
class TrackingLogger(DatasetLogger):
    """In-memory logger that keeps the most recently logged row per rollout.

    Rows are written into the caller-supplied ``rollouts`` mapping, keyed by
    ``row.execution_metadata.rollout_id``, so a test can inspect the last row
    logged for each rollout after the run finishes.
    """

    def __init__(self, rollouts: dict[str, EvaluationRow]):
        # Hold a reference (not a copy) so the caller observes logged rows.
        self.rollouts = rollouts

    def log(self, row: EvaluationRow):
        """Store ``row`` under its rollout id, replacing any earlier row."""
        rollout_id = row.execution_metadata.rollout_id
        self.rollouts[rollout_id] = row

    def read(self):
        """Return an empty list; this logger does not read rows back."""
        return []
18+
19+
20+
async def test_pytest_propagate_error():
    """
    Properly propagate errors from rollout processing to eval_metadata.status.

    To test this, we use a broken MCP configuration that should fail during
    rollout processing. The last row logged for every rollout should then have
    eval_metadata.status == "error", so that the UI can render an error state
    for the rollout and a developer can identify and investigate the failure.
    """
    from eval_protocol.pytest.evaluation_test import evaluation_test

    # Single source of truth for the run count: it is passed to the decorator
    # and checked by the assertion below, so the two cannot drift apart.
    num_runs = 5

    input_messages = [
        [
            Message(
                role="system",
                content="You are a helpful assistant that can answer questions about Fireworks.",
            ),
        ]
    ]
    completion_params_list = [
        {"model": "dummy/local-model"},
    ]

    # TrackingLogger writes each logged row into this dict, keyed by rollout id.
    rollouts: dict[str, EvaluationRow] = {}
    logger = TrackingLogger(rollouts)

    @evaluation_test(
        input_messages=input_messages,
        completion_params=completion_params_list,
        rollout_processor=AgentRolloutProcessor(),
        mode="pointwise",
        num_runs=num_runs,
        mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config_broken.json",
        logger=logger,
    )
    def eval_fn(row: EvaluationRow) -> EvaluationRow:
        return row

    # Manually invoke all parameter combinations within a single test
    for params in completion_params_list:
        await eval_fn(input_messages=input_messages, completion_params=params)

    # One distinct rollout id is expected per run.
    assert len(rollouts) == num_runs, f"expected {num_runs} rollouts, got {len(rollouts)}"

    # Every rollout must end in the error state; report the offenders if not.
    not_errored = {
        rollout_id: row.eval_metadata.status
        for rollout_id, row in rollouts.items()
        if row.eval_metadata.status != "error"
    }
    assert not not_errored, f"rollouts not in error state: {not_errored}"

0 commit comments

Comments
 (0)