Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/pytest/datasets/gmail_inbox.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the first 5 emails title in my inbox." } ], "ground_truth": "The first 5 emails contain meeting between Benny and Zheng"}
8 changes: 4 additions & 4 deletions tests/pytest/mcp_configurations/klavis_strata_mcp.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"mcpServers": {
"klavis-strata": {
"url": "https://strata.klavis.ai/mcp/",
"authorization": "Bearer ${KLAVIS_API_KEY}"
}
"klavis-strata": {
"url": "https://strata.klavis.ai/mcp/",
"authorization": "Bearer ${KLAVIS_API_KEY}"
}
}
}
69 changes: 41 additions & 28 deletions tests/pytest/test_pytest_klavis_mcp.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,54 @@
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test
from openai import AsyncOpenAI
import json
from pydantic import BaseModel
import logging

logger = logging.getLogger(__name__)
import os


# Shape of the JSON reply requested from the LLM judge: its JSON schema is
# passed as the `response_format` of the judge request in
# `test_pytest_klavis_mcp`.
# NOTE: documented with `#` comments rather than a docstring on purpose --
# pydantic copies a class docstring into the generated JSON schema as its
# "description", which would change what is sent to the judge.
class ResponseFormat(BaseModel):
    # Judge verdict; the judge prompt asks for 1 when the output contains the
    # ground truth and 0 otherwise.
    score: float


@evaluation_test(
    input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
    rollout_processor=AgentRolloutProcessor(),
    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
    mode="pointwise",
    mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
)
async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
    """Score a Klavis MCP rollout by asking an LLM judge to compare it with the ground truth.

    The content of the rollout's last message and ``row.ground_truth`` are sent
    to a judge model through the Fireworks OpenAI-compatible endpoint. The judge
    is asked to reply with JSON matching ``ResponseFormat``; the parsed
    ``score`` and the raw reply text are stored on ``row.evaluation_result``.

    Args:
        row: The rollout to grade. Its last message is treated as the final
            model output.

    Returns:
        The same ``row`` with ``evaluation_result`` populated.

    Raises:
        KeyError: If ``FIREWORKS_API_KEY`` is missing from the environment, or
            if the judge reply is empty or has no ``"score"`` key.
        json.JSONDecodeError: If the judge reply is not valid JSON.
    """
    ground_truth = row.ground_truth
    final_output = row.messages[-1].content

    # Ask the judge whether the final message contains the ground truth.
    async with AsyncOpenAI(
        api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1"
    ) as client:
        response = await client.chat.completions.create(
            model="accounts/fireworks/models/kimi-k2-instruct-0905",
            messages=[
                {
                    "role": "system",
                    "content": "You are judging the output of the model versus the ground truth. Return score = 1 if the output contains the ground truth, 0 otherwise.",
                },
                {
                    "role": "user",
                    # Must be an f-string: without the prefix the judge would
                    # receive the literal "{...}" placeholders instead of the
                    # model output and ground truth it is supposed to compare.
                    "content": f"Final model output: {final_output}\nGround truth: {ground_truth}",
                },
            ],
            # Constrain the reply to the ResponseFormat JSON schema so the
            # score can be parsed mechanically below.
            response_format={
                "type": "json_schema",
                "json_schema": {"name": "ResponseFormat", "schema": ResponseFormat.model_json_schema()},
            },
        )
        response_text = response.choices[0].message.content
        logger.info("response_text: %s", response_text)
        # An empty reply is parsed as "{}" and then fails loudly on the
        # missing "score" key rather than being silently scored.
        score = json.loads(response_text or "{}")["score"]
        row.evaluation_result = EvaluateResult(
            score=score,
            reason=response_text,
        )
        return row
Loading