Skip to content

Commit 041e04f

Browse files
author
Dylan Huang
committed
Merge branch 'main' into in-progress-eval-viewer
Conflicts: pyproject.toml
2 parents 290714f + d3c4007 commit 041e04f

File tree

6 files changed

+137
-4
lines changed

6 files changed

+137
-4
lines changed

eval_protocol/mcp/mcpgym.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from mcp.server.fastmcp import Context, FastMCP
2727
from starlette.requests import Request
2828
from starlette.responses import JSONResponse
29+
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
2930

3031
from .adapter import EnvironmentAdapter
3132

@@ -562,11 +563,18 @@ def run(self, transport: str = "streamable-http", **kwargs):
562563
async def run_with_high_concurrency():
563564
starlette_app = self.mcp.streamable_http_app()
564565

566+
if not kwargs.get("redirect_slashes", True) and hasattr(starlette_app, "router"):
567+
starlette_app.router.redirect_slashes = False
568+
569+
starlette_app.add_middleware(ProxyHeadersMiddleware, trusted_hosts="*")
570+
565571
config = uvicorn.Config(
566572
starlette_app,
567573
host=self.mcp.settings.host,
568574
port=self.mcp.settings.port,
569575
log_level=self.mcp.settings.log_level.lower(),
576+
proxy_headers=True,
577+
forwarded_allow_ips="*",
570578
# HIGH CONCURRENCY SETTINGS
571579
limit_concurrency=200, # Increase for HTTP endpoints + MCP
572580
limit_max_requests=100000, # Higher request limit

eval_protocol/mcp_env.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,17 +104,17 @@ def make(
104104
if evaluation_rows:
105105
for i, row in enumerate(evaluation_rows):
106106
dataset_info = row.input_metadata.dataset_info if row.input_metadata else {}
107-
107+
108108
system_message = row.get_system_message()
109109
system_prompt = system_message.content or ""
110-
110+
111111
dataset_entry = {
112112
"id": row.input_metadata.row_id if row.input_metadata and row.input_metadata.row_id else f"task_{i}",
113113
"system_prompt": system_prompt,
114114
"user_prompt_template": dataset_info.get("user_prompt_template", ""),
115115
"environment_context": dataset_info.get("environment_context", {}),
116116
"user_simulation": dataset_info.get("user_simulation", {}),
117-
"evaluation_criteria": dataset_info.get("evaluation_criteria", {})
117+
"evaluation_criteria": dataset_info.get("evaluation_criteria", {}),
118118
}
119119
internal_dataset.append(dataset_entry)
120120
elif dataset:

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ dependencies = [
4949
"pandas>=1.5.0",
5050
"watchdog>=2.1.0",
5151
"websockets>=15.0.1",
52+
"fireworks-ai>=0.19.12",
5253
]
5354

5455
[project.urls]
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"knowledge": " It is a hygroscopic solid that is highly soluble in water and slightly soluble in alcohol.Ethanol, also called alcohol, ethyl alcohol, and drinking alcohol, is a compound and simple alcohol with the chemical formula C2H5OH .", "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", "right_answer": "alcohol", "hallucinated_answer": "water with a hint of alcohol"}
2+
{"knowledge": "The Great Outdoors is a 1988 American comedy film directed by Howard Deutch, and written and produced by John Hughes. It stars Dan Aykroyd, John Candy, Stephanie Faracy and Annette Bening in her film debut.Annette Carol Bening (born May 29, 1958) is an American actress. She is a four-time Academy Award nominee; for \"The Grifters\" (1990), \"American Beauty\" (1999), \"Being Julia\" (2004) and \"The Kids Are All Right\" (2010). In 2006, she received a star on the Hollywood Walk of Fame.", "question": "The 1988 American comedy film, The Great Outdoors, starred a four-time Academy Award nominee, who received a star on the Hollywood Walk of Fame in what year?", "right_answer": "2006", "hallucinated_answer": "Annette Bening received her Hollywood star in 1988."}
3+
{"knowledge": " Her self-titled debut studio album was released on 2 June 2017.\"New Rules\" is a song by English singer Dua Lipa from her eponymous debut studio album (2017).", "question": "Dua Lipa, an English singer, songwriter and model, the album spawned the number-one single \"New Rules\" is a song by English singer Dua Lipa from her eponymous debut studio album, released in what year?", "right_answer": "2017", "hallucinated_answer": "The album was released in 2018."}

tests/pytest/test_hallucination.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
"""
2+
Hallucination detection test using LLM-as-judge.
3+
4+
This test demonstrates how to detect factual inaccuracies in model responses
5+
by comparing them against provided knowledge using an LLM judge, similar to
6+
tau's evaluate_nl_assertions approach.
7+
"""
8+
9+
import json
10+
from typing import Any, Dict, List
11+
12+
from fireworks import LLM
13+
14+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
15+
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
16+
17+
judge_llm = LLM(model="accounts/fireworks/models/kimi-k2-instruct", deployment_type="serverless")
18+
19+
20+
def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Convert raw HaluEval records into EvaluationRow objects.

    Each record's knowledge passage and question are folded into a single
    user message; the record's right answer becomes the row's ground truth.
    """
    rows: List[EvaluationRow] = []
    for record in data:
        prompt = f"Knowledge: {record['knowledge']}\n\nQuestion: {record['question']}"
        rows.append(
            EvaluationRow(
                messages=[Message(role="user", content=prompt)],
                ground_truth=record["right_answer"]
            )
        )
    return rows
29+
30+
31+
@evaluation_test(
    input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
    dataset_adapter=hallucination_dataset_adapter,
    model=["accounts/fireworks/models/kimi-k2-instruct"],
    rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
    rollout_processor=default_single_turn_rollout_processor,
    threshold_of_success=1.0,
    num_runs=1,
    mode="pointwise",
)
def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
    """
    Test for response correctness using LLM-as-judge.

    Sends the rolled-out assistant response together with the ground-truth
    answer to a judge model and records a binary score on the row:
    1.0 when the judge deems the response correct, 0.0 otherwise.

    Args:
        row: Rolled-out evaluation row; its last message is assumed to be the
            assistant response, and ``row.ground_truth`` holds the reference
            answer produced by ``hallucination_dataset_adapter``.

    Returns:
        The same row with ``row.evaluation_result`` populated.
    """
    messages = row.messages
    assistant_response = messages[-1].content

    if not assistant_response:
        # FIX: the original returned a bare EvaluateResult here, violating the
        # declared ``-> EvaluationRow`` contract that the success path below
        # follows. Attach the result to the row and return the row instead.
        row.evaluation_result = EvaluateResult(score=0.0, reason="❌ No assistant response found")
        return row

    correct_answer = row.ground_truth

    system_prompt = """
    TASK
    - You will be given an assistant's response and the correct answer.
    - Your job is to evaluate whether the assistant's response is factually consistent with the correct answer.
    - Grade whether the assistant got it right or wrong.

    FORMAT
    - Your response should be a JSON object with the following fields:
    - `reasoning`: a short explanation for your classification
    - `is_correct`: `true` if the assistant's response matches the correct answer, `false` otherwise

    Example response structure:
    {
        "reasoning": "<reasoning trace>",
        "is_correct": <true or false>
    }
    """

    user_prompt = f"""
    assistant_response:
    {assistant_response}

    correct_answer:
    {correct_answer}
    """

    try:
        # Low temperature keeps the judge near-deterministic for grading.
        response = judge_llm.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,
            max_tokens=500,
        )

        result_data = json.loads(response.choices[0].message.content)
        is_correct = result_data.get("is_correct", False)
        reasoning = result_data.get("reasoning", "Could not parse reasoning")

    except Exception as e:
        # Fallback if the judge call or JSON parsing fails: score as incorrect
        # but surface the failure reason rather than crashing the eval run.
        is_correct = False
        reasoning = f"Evaluation failed: {str(e)}"

    score = 1.0 if is_correct else 0.0

    if is_correct:
        assessment = "✅ Response is correct"
    else:
        assessment = "❌ Response is incorrect"

    reason = f"{assessment}\nReasoning: {reasoning}"

    row.evaluation_result = EvaluateResult(
        score=score,
        reason=reason,
        metrics={
            "llm_judge": MetricResult(
                score=score,
                reason=reasoning,
                is_score_valid=True
            )
        }
    )

    return row

uv.lock

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)