99
from typing import Any, Dict, List

from eval_protocol.models import EvaluateResult, EvaluationRow, Message
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
1313
1414
def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Convert a coding dataset into a list of EvaluationRow objects.

    Args:
        data: Dataset rows; each row is a dict expected to contain
            'prompt', 'input', and 'expected_output' keys.

    Returns:
        One EvaluationRow per dataset row, with a single user message that
        concatenates the prompt and the input, and ground_truth set to the
        row's expected output.
    """
    return [
        EvaluationRow(
            # The model sees the task prompt followed by the concrete input.
            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")],
            ground_truth=row["expected_output"],
        )
        for row in data
    ]
@@ -30,63 +30,59 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
3030 dataset_adapter = coding_dataset_to_evaluation_row ,
3131 model = ["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct" ],
3232 rollout_input_params = [{"temperature" : 0.0 , "max_tokens" : 4096 }],
33- threshold_of_success = 0.8 ,
33+ threshold = 0.8 ,
3434 rollout_processor = default_single_turn_rollout_processor ,
3535 num_runs = 1 ,
3636 mode = "pointwise" ,
3737)
def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Evaluation function that tests code correctness by executing it locally.

    This function:
    1. Extracts Python code from the assistant's response
    2. Executes the code locally with timeout=10
    3. Compares the output to ground_truth
    4. Returns a score of 1.0 if output matches, 0.0 otherwise

    Args:
        row: EvaluationRow containing the conversation messages and expected_output in ground_truth

    Returns:
        EvaluationRow with the evaluation result
    """
    # Check if we have an assistant response
    if len(row.messages) < 2 or row.messages[-1].role != "assistant":
        row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")
        return row

    assistant_content = row.messages[-1].content or ""
    expected_output = (row.ground_truth or "").strip()

    # Extract Python code blocks
    code_blocks = extract_code_blocks(assistant_content, language="python")
    if not code_blocks:
        row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found")
        return row

    # Only the first code block is evaluated; later blocks are ignored.
    code = code_blocks[0]["code"]

    # Execute the code locally
    execution_result = execute_python_code(code, timeout=10)

    if not execution_result.get("success", False):
        error_msg = execution_result.get("error", "Code execution failed")
        row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {error_msg}")
        return row

    # Compare output with expected (both sides stripped of surrounding whitespace)
    actual_output = (execution_result.get("output", "") or "").strip()

    if actual_output == expected_output:
        row.evaluation_result = EvaluateResult(score=1.0, reason=f"✅ Output matches: '{actual_output}'")
    else:
        row.evaluation_result = EvaluateResult(
            score=0.0, reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
        )

    return row
0 commit comments