Skip to content

Commit 041e04f

Browse files
author
Dylan Huang
committed
Merge branch 'main' into in-progress-eval-viewer
Conflicts: pyproject.toml
2 parents 290714f + d3c4007 commit 041e04f

File tree

6 files changed

+137
-4
lines changed

6 files changed

+137
-4
lines changed

eval_protocol/mcp/mcpgym.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from mcp.server.fastmcp import Context, FastMCP
2727
from starlette.requests import Request
2828
from starlette.responses import JSONResponse
29+
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
2930

3031
from .adapter import EnvironmentAdapter
3132

@@ -562,11 +563,18 @@ def run(self, transport: str = "streamable-http", **kwargs):
562563
async def run_with_high_concurrency():
563564
starlette_app = self.mcp.streamable_http_app()
564565

566+
if not kwargs.get("redirect_slashes", True) and hasattr(starlette_app, "router"):
567+
starlette_app.router.redirect_slashes = False
568+
569+
starlette_app.add_middleware(ProxyHeadersMiddleware, trusted_hosts="*")
570+
565571
config = uvicorn.Config(
566572
starlette_app,
567573
host=self.mcp.settings.host,
568574
port=self.mcp.settings.port,
569575
log_level=self.mcp.settings.log_level.lower(),
576+
proxy_headers=True,
577+
forwarded_allow_ips="*",
570578
# HIGH CONCURRENCY SETTINGS
571579
limit_concurrency=200, # Increase for HTTP endpoints + MCP
572580
limit_max_requests=100000, # Higher request limit

eval_protocol/mcp_env.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,17 +104,17 @@ def make(
104104
if evaluation_rows:
105105
for i, row in enumerate(evaluation_rows):
106106
dataset_info = row.input_metadata.dataset_info if row.input_metadata else {}
107-
107+
108108
system_message = row.get_system_message()
109109
system_prompt = system_message.content or ""
110-
110+
111111
dataset_entry = {
112112
"id": row.input_metadata.row_id if row.input_metadata and row.input_metadata.row_id else f"task_{i}",
113113
"system_prompt": system_prompt,
114114
"user_prompt_template": dataset_info.get("user_prompt_template", ""),
115115
"environment_context": dataset_info.get("environment_context", {}),
116116
"user_simulation": dataset_info.get("user_simulation", {}),
117-
"evaluation_criteria": dataset_info.get("evaluation_criteria", {})
117+
"evaluation_criteria": dataset_info.get("evaluation_criteria", {}),
118118
}
119119
internal_dataset.append(dataset_entry)
120120
elif dataset:

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ dependencies = [
4949
"pandas>=1.5.0",
5050
"watchdog>=2.1.0",
5151
"websockets>=15.0.1",
52+
"fireworks-ai>=0.19.12",
5253
]
5354

5455
[project.urls]
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"knowledge": " It is a hygroscopic solid that is highly soluble in water and slightly soluble in alcohol.Ethanol, also called alcohol, ethyl alcohol, and drinking alcohol, is a compound and simple alcohol with the chemical formula C2H5OH .", "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", "right_answer": "alcohol", "hallucinated_answer": "water with a hint of alcohol"}
2+
{"knowledge": "The Great Outdoors is a 1988 American comedy film directed by Howard Deutch, and written and produced by John Hughes. It stars Dan Aykroyd, John Candy, Stephanie Faracy and Annette Bening in her film debut.Annette Carol Bening (born May 29, 1958) is an American actress. She is a four-time Academy Award nominee; for \"The Grifters\" (1990), \"American Beauty\" (1999), \"Being Julia\" (2004) and \"The Kids Are All Right\" (2010). In 2006, she received a star on the Hollywood Walk of Fame.", "question": "The 1988 American comedy film, The Great Outdoors, starred a four-time Academy Award nominee, who received a star on the Hollywood Walk of Fame in what year?", "right_answer": "2006", "hallucinated_answer": "Annette Bening received her Hollywood star in 1988."}
3+
{"knowledge": " Her self-titled debut studio album was released on 2 June 2017.\"New Rules\" is a song by English singer Dua Lipa from her eponymous debut studio album (2017).", "question": "Dua Lipa, an English singer, songwriter and model, the album spawned the number-one single \"New Rules\" is a song by English singer Dua Lipa from her eponymous debut studio album, released in what year?", "right_answer": "2017", "hallucinated_answer": "The album was released in 2018."}

tests/pytest/test_hallucination.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
"""
2+
Hallucination detection test using LLM-as-judge.
3+
4+
This test demonstrates how to detect factual inaccuracies in model responses
5+
by comparing them against provided knowledge using an LLM judge, similar to
6+
tau's evaluate_nl_assertions approach.
7+
"""
8+
9+
import json
10+
from typing import Any, Dict, List
11+
12+
from fireworks import LLM
13+
14+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
15+
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
16+
17+
judge_llm = LLM(model="accounts/fireworks/models/kimi-k2-instruct", deployment_type="serverless")
18+
19+
20+
def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Convert raw HaluEval records into EvaluationRow objects.

    Each record's knowledge passage and question are folded into a single
    user message; the record's right answer becomes the row's ground truth.
    """
    rows: List[EvaluationRow] = []
    for record in data:
        prompt = f"Knowledge: {record['knowledge']}\n\nQuestion: {record['question']}"
        rows.append(
            EvaluationRow(
                messages=[Message(role="user", content=prompt)],
                ground_truth=record["right_answer"]
            )
        )
    return rows
29+
30+
31+
@evaluation_test(
    input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
    dataset_adapter=hallucination_dataset_adapter,
    model=["accounts/fireworks/models/kimi-k2-instruct"],
    rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
    rollout_processor=default_single_turn_rollout_processor,
    threshold_of_success=1.0,
    num_runs=1,
    mode="pointwise",
)
def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
    """
    Test for response correctness using LLM-as-judge.

    Sends the rolled-out assistant response together with the ground-truth
    answer to a judge model and records a binary score on the row:
    1.0 when the judge deems the response correct, 0.0 otherwise.

    Args:
        row: Rolled-out evaluation row; its last message is assumed to be the
            assistant response, and ``row.ground_truth`` holds the reference
            answer produced by ``hallucination_dataset_adapter``.

    Returns:
        The same row with ``row.evaluation_result`` populated.
    """
    messages = row.messages
    assistant_response = messages[-1].content

    if not assistant_response:
        # FIX: the original returned a bare EvaluateResult here, violating the
        # declared ``-> EvaluationRow`` contract that the success path below
        # follows. Attach the result to the row and return the row instead.
        row.evaluation_result = EvaluateResult(score=0.0, reason="❌ No assistant response found")
        return row

    correct_answer = row.ground_truth

    system_prompt = """
    TASK
    - You will be given an assistant's response and the correct answer.
    - Your job is to evaluate whether the assistant's response is factually consistent with the correct answer.
    - Grade whether the assistant got it right or wrong.

    FORMAT
    - Your response should be a JSON object with the following fields:
    - `reasoning`: a short explanation for your classification
    - `is_correct`: `true` if the assistant's response matches the correct answer, `false` otherwise

    Example response structure:
    {
        "reasoning": "<reasoning trace>",
        "is_correct": <true or false>
    }
    """

    user_prompt = f"""
    assistant_response:
    {assistant_response}

    correct_answer:
    {correct_answer}
    """

    try:
        # Low temperature keeps the judge near-deterministic for grading.
        response = judge_llm.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,
            max_tokens=500,
        )

        result_data = json.loads(response.choices[0].message.content)
        is_correct = result_data.get("is_correct", False)
        reasoning = result_data.get("reasoning", "Could not parse reasoning")

    except Exception as e:
        # Fallback if the judge call or JSON parsing fails: score as incorrect
        # but surface the failure reason rather than crashing the eval run.
        is_correct = False
        reasoning = f"Evaluation failed: {str(e)}"

    score = 1.0 if is_correct else 0.0

    if is_correct:
        assessment = "✅ Response is correct"
    else:
        assessment = "❌ Response is incorrect"

    reason = f"{assessment}\nReasoning: {reasoning}"

    row.evaluation_result = EvaluateResult(
        score=score,
        reason=reason,
        metrics={
            "llm_judge": MetricResult(
                score=score,
                reason=reasoning,
                is_score_valid=True
            )
        }
    )

    return row

uv.lock

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)