Skip to content

Commit d000f19

Browse files
committed
rename listwise to all
1 parent d587101 commit d000f19

File tree

6 files changed

+15
-17
lines changed

6 files changed

+15
-17
lines changed

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,9 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
7373

7474
_litellm = importlib.import_module("litellm")
7575
acompletion = getattr(_litellm, "acompletion")
76-
logger.debug(f"********** request_params: {request_params} **********")
7776
response = await acompletion(**request_params)
7877

7978
assistant_content = response.choices[0].message.content or ""
80-
logger.debug(f"********** assistant_content: {assistant_content} **********")
8179
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
8280

8381
converted_tool_calls = None

eval_protocol/pytest/evaluation_test.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ def evaluation_test( # noqa: C901
310310
steps: Number of rollout steps to execute (default: 30).
311311
mode: Evaluation mode. "pointwise" (default) applies test function to each row (rollout result).
312312
"groupwise" applies test function to a group of rollout results from the same original row (for use cases such as dpo/grpo).
313-
"listwise" applies test function to the whole dataset.
313+
"all" applies test function to the whole dataset.
314314
logger: DatasetLogger to use for logging. If not provided, a default logger will be used.
315315
"""
316316

@@ -349,29 +349,29 @@ def decorator(
349349
# additional check for groupwise evaluation
350350
elif mode == "groupwise":
351351
if "rows" not in sig.parameters:
352-
raise ValueError("In listwise mode, your eval function must have a parameter named 'rows'")
352+
raise ValueError("In groupwise mode, your eval function must have a parameter named 'rows'")
353353

354354
# validate that "Rows" is of type List[EvaluationRow]
355355
if sig.parameters["rows"].annotation is not List[EvaluationRow]:
356-
raise ValueError("In listwise mode, the 'rows' parameter must be of type List[EvaluationRow")
356+
raise ValueError("In groupwise mode, the 'rows' parameter must be of type List[EvaluationRow]")
357357

358358
# validate that the function has a return type of List[EvaluationRow]
359359
if sig.return_annotation is not List[EvaluationRow]:
360-
raise ValueError("In listwise mode, your eval function must return a list of EvaluationRow instances")
360+
raise ValueError("In groupwise mode, your eval function must return a list of EvaluationRow instances")
361361
if len(completion_params) < 2:
362362
raise ValueError("In groupwise mode, you must provide at least 2 completion parameters")
363363
else:
364-
# listwise mode: function should accept input_dataset and model
364+
# all mode: function should accept input_dataset and model
365365
if "rows" not in sig.parameters:
366-
raise ValueError("In batch mode, your eval function must have a parameter named 'rows'")
366+
raise ValueError("In all mode, your eval function must have a parameter named 'rows'")
367367

368368
# validate that "Rows" is of type List[EvaluationRow]
369369
if sig.parameters["rows"].annotation is not List[EvaluationRow]:
370-
raise ValueError("In batch mode, the 'rows' parameter must be of type List[EvaluationRow")
370+
raise ValueError("In all mode, the 'rows' parameter must be of type List[EvaluationRow]")
371371

372372
# validate that the function has a return type of List[EvaluationRow]
373373
if sig.return_annotation is not List[EvaluationRow]:
374-
raise ValueError("In listwise mode, your eval function must return a list of EvaluationRow instances")
374+
raise ValueError("In all mode, your eval function must return a list of EvaluationRow instances")
375375

376376
async def execute_with_params(
377377
test_func: TestFunction,
@@ -434,7 +434,7 @@ async def execute_with_params(
434434
param_tuple.append(etk)
435435
param_tuples.append(tuple(param_tuple))
436436

437-
# For listwise mode, preserve the original parameter names
437+
# For all mode, preserve the original parameter names
438438
test_param_names = []
439439
if input_dataset is not None:
440440
test_param_names.append("dataset_path")

eval_protocol/pytest/types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@
1919

2020
Dataset = List[EvaluationRow]
2121

22-
EvaluationTestMode = Literal["pointwise", "groupwise", "listwise"]
22+
EvaluationTestMode = Literal["pointwise", "groupwise", "all"]
2323
"""
2424
"pointwise": (default) applies test function to each row (rollout result).
2525
"groupwise": applies test function to a group of rollout results from the same original row (for use cases such as dpo/grpo).
26-
"listwise": applies test function to the whole dataset.
26+
"all": applies test function to the whole dataset.
2727
"""
2828

2929
"""

tests/pytest/test_pytest_async.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
],
1919
],
2020
completion_params=[{"model": "accounts/fireworks/models/kimi-k2-instruct"}],
21-
mode="listwise",
21+
mode="all",
2222
)
2323
async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
2424
"""Run math evaluation on sample dataset using pytest interface."""
@@ -32,7 +32,7 @@ async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
3232
],
3333
],
3434
completion_params=[{"model": "accounts/fireworks/models/kimi-k2-instruct"}],
35-
mode="pointwise",
35+
mode="all",
3636
)
3737
async def test_pytest_async_pointwise(row: EvaluationRow) -> EvaluationRow:
3838
"""Run pointwise evaluation on sample dataset using pytest interface."""

tests/pytest/test_pytest_default_agent_rollout_processor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
],
1919
rollout_processor=AgentRolloutProcessor(),
2020
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
21-
mode="listwise",
21+
mode="all",
2222
)
2323
def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
2424
"""Run math evaluation on sample dataset using pytest interface."""

tests/pytest/test_pytest_input_messages.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
],
1313
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
1414
rollout_processor=SingleTurnRolloutProcessor(),
15-
mode="listwise",
15+
mode="all",
1616
)
1717
def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]:
1818
"""Run math evaluation on sample dataset using pytest interface."""

0 commit comments

Comments
 (0)