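Format pytest wrappers (blank lines, wrapped conditions); switch direct-run tests to mode="all" and tighten their assertions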

mayinghan · mayinghan · commit 999ed18a5d40 · 2025-08-19T21:55:53.000-07:00
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -729,13 +729,15 @@ async def _collect_result(config, lst, max_retry):
                 except Exception:
                     _log_eval_error("error", data if "data" in locals() else None, passed=False)
                     raise
+
             if asyncio.iscoroutinefunction(test_func):
                 return create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names)
             else:
+
                 def sync_wrapper_body(**kwargs):
                     return asyncio.run(wrapper_body(**kwargs))
-                return create_dynamically_parameterized_wrapper(test_func, sync_wrapper_body, test_param_names)
 
+                return create_dynamically_parameterized_wrapper(test_func, sync_wrapper_body, test_param_names)
 
         # Create the pytest wrapper
         pytest_wrapper = create_wrapper_with_signature()
@@ -763,6 +765,7 @@ def create_dual_mode_wrapper() -> Callable:
             is_async = asyncio.iscoroutinefunction(test_func)
 
             if is_async:
+
                 async def dual_mode_wrapper(*args, **kwargs):
                     # Check if this is a direct call with the expected signature
                     if mode == "pointwise":
@@ -789,20 +792,30 @@ async def dual_mode_wrapper(*args, **kwargs):
 
                     # If not a direct call, use the pytest wrapper
                     return await pytest_wrapper(*args, **kwargs)
-                
+
                 _dual_model_wrapper_fn = dual_mode_wrapper
             else:
+
                 def dual_mode_wrapper(*args, **kwargs):
                     if mode == "pointwise":
                         if len(args) == 1 and isinstance(args[0], EvaluationRow) and not kwargs:
                             return test_func(row=args[0])
                     else:
-                        if len(args) == 1 and isinstance(args[0], list) and all(isinstance(r, EvaluationRow) for r in args[0]) and not kwargs:
+                        if (
+                            len(args) == 1
+                            and isinstance(args[0], list)
+                            and all(isinstance(r, EvaluationRow) for r in args[0])
+                            and not kwargs
+                        ):
                             return test_func(rows=args[0])
-                        if "rows" in kwargs and isinstance(kwargs["rows"], list) and all(isinstance(r, EvaluationRow) for r in kwargs["rows"]):
+                        if (
+                            "rows" in kwargs
+                            and isinstance(kwargs["rows"], list)
+                            and all(isinstance(r, EvaluationRow) for r in kwargs["rows"])
+                        ):
                             return test_func(**kwargs)
                     return pytest_wrapper(*args, **kwargs)
-                
+
                 _dual_model_wrapper_fn = dual_mode_wrapper
 
             # Copy all attributes from the pytest wrapper to our dual mode wrapper
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
@@ -99,10 +99,12 @@ def create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param
     from functools import wraps
 
     if asyncio.iscoroutinefunction(wrapper_body):
+
         @wraps(test_func)
         async def wrapper(**kwargs):
             return await wrapper_body(**kwargs)
     else:
+
         @wraps(test_func)
         def wrapper(**kwargs):
             return wrapper_body(**kwargs)
diff --git a/tests/pytest/test_direct_run.py b/tests/pytest/test_direct_run.py
@@ -15,7 +15,7 @@
     ],
     completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
     rollout_processor=SingleTurnRolloutProcessor(),
-    mode="listwise",
+    mode="all",
 )
 def test_direct_run(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
@@ -53,7 +53,7 @@ def test_direct_run_main():
     ],
     completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
     rollout_processor=SingleTurnRolloutProcessor(),
-    mode="listwise",
+    mode="all",
 )
 async def test_direct_run_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
@@ -62,21 +62,22 @@ async def test_direct_run_async(rows: List[EvaluationRow]) -> List[EvaluationRow
     return rows
 
 
-
 @pytest.mark.asyncio
 async def test_direct_run_async_main():
     rows = [
         EvaluationRow(
             messages=[
-                Message(role="user", content="What is the capital of France?"),
+                Message(role="user", content="1"),
             ],
         ),
         EvaluationRow(
             messages=[
-                Message(role="user", content="What is the capital of the moon?"),
+                Message(role="user", content="2"),
             ],
         ),
     ]
     res = await test_direct_run_async(rows)
+    assert res[0].messages[0].content == "1"
+    assert res[1].messages[0].content == "2"
     assert res[0].evaluation_result.score == 0
-    assert res[1].evaluation_result.score == 1
+    assert res[1].evaluation_result.score == 1