eval-protocol · benjibc · Nov 4, 2025 · Nov 4, 2025 · Nov 4, 2025 · cursor
diff --git a/tests/pytest/datasets/gmail_inbox.jsonl b/tests/pytest/datasets/gmail_inbox.jsonl
@@ -0,0 +1 @@
+{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the first 5 emails title in my inbox." } ], "ground_truth": "The first 5 emails contain meeting between Benny and Zheng"}
diff --git a/tests/pytest/mcp_configurations/klavis_strata_mcp.json b/tests/pytest/mcp_configurations/klavis_strata_mcp.json
@@ -1,8 +1,8 @@
 {
   "mcpServers": {
-      "klavis-strata": {
-        "url": "https://strata.klavis.ai/mcp/",
-        "authorization": "Bearer ${KLAVIS_API_KEY}"
-      }
+    "klavis-strata": {
+      "url": "https://strata.klavis.ai/mcp/",
+      "authorization": "Bearer ${KLAVIS_API_KEY}"
     }
+  }
 }
diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py
@@ -1,41 +1,54 @@
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message
 from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test
+from openai import AsyncOpenAI
+import json
+from pydantic import BaseModel
+import logging
+
+logger = logging.getLogger(__name__)
+import os
+
+
+class ResponseFormat(BaseModel):
+    score: float  # judge verdict; the judge prompt asks for 1 if the output contains the ground truth, else 0
 
 
 @evaluation_test(
-    input_messages=[
-        [
-            [
-                Message(
-                    role="system",
-                    content=(
-                        "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information.\n"
-                    ),
-                ),
-                Message(
-                    role="user",
-                    content=("Find the first 5 emails title in my inbox."),
-                ),
-            ]
-        ]
-    ],
+    input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
     rollout_processor=AgentRolloutProcessor(),
     completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
     mode="pointwise",
     mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
 )
-def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
-    # filter for all tool calls
-    tool_calls = [msg for msg in row.messages if msg.role == "tool"]
-    if len(tool_calls) == 0:
+async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
+    """Score the rollout with an LLM judge: 1 if the final message contains the ground truth, else 0."""
+    ground_truth = row.ground_truth
+
+    async with AsyncOpenAI(
+        api_key=os.environ["FIREWORKS_API_KEY"], base_url="https://api.fireworks.ai/inference/v1"
+    ) as client:
+        response = await client.chat.completions.create(
+            model="accounts/fireworks/models/kimi-k2-instruct-0905",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are judging the output of the model versus the ground truth. Return score = 1 if the output contains the ground truth, 0 otherwise.",
+                },
+                {
+                    "role": "user",
+                    "content": f"Final model output: {row.messages[-1].content}\nGround truth: {ground_truth}",
+                },
+            ],
+            response_format={
+                "type": "json_schema",
+                "json_schema": {"name": "ResponseFormat", "schema": ResponseFormat.model_json_schema()},
+            },
+        )
+        response_text = response.choices[0].message.content
+        logger.info("response_text: %s", response_text)
+        score = json.loads(response_text or "{}")["score"]
         row.evaluation_result = EvaluateResult(
-            score=0,
-            reason="No tool calls made",
+            score=score,
+            reason=response_text,
         )
-        return row
-
-    row.evaluation_result = EvaluateResult(
-        score=1,
-        reason="At least one tool call was made",
-    )
     return row
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the first 5 emails title in my inbox." } ], "ground_truth": "The first 5 emails contain meeting between Benny and Zheng"}