test_pydantic_multi_agent runs

Dylan Huang · Dylan Huang · commit eae0959911d3 · 2025-08-28T22:06:02.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+experiment_results/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/eval_protocol/pytest/parameterize.py b/eval_protocol/pytest/parameterize.py
@@ -44,7 +44,6 @@ def pytest_parametrize(
         argnames.append("evaluation_test_kwargs")
 
     argvalues: list[ParameterSet | Sequence[object] | object] = []
-    param_tuples: list[tuple[object, ...]] = []
     for combo in combinations:
         dataset, cp, messages, rows, etk = combo
         param_tuple: list[object] = []
@@ -63,7 +62,7 @@ def pytest_parametrize(
             raise ValueError(
                 f"The length of argnames ({len(argnames)}) is not the same as the length of param_tuple ({len(param_tuple)})"
             )
-        param_tuples.append(tuple(param_tuple))
+        argvalues.append(tuple(param_tuple))
 
     return PytestParametrizeArgs(argnames=argnames, argvalues=argvalues)
 
diff --git a/eval_protocol/pytest/validate_signature.py b/eval_protocol/pytest/validate_signature.py
@@ -1,10 +1,26 @@
 from collections.abc import Sequence
 from inspect import Signature
+from typing import get_origin, get_args
 
 from eval_protocol.models import CompletionParams, EvaluationRow
 from eval_protocol.pytest.types import EvaluationTestMode
 
 
+def _is_list_of_evaluation_row(annotation) -> bool:  # pyright: ignore[reportUnknownParameterType, reportMissingParameterType]
+    """Check if annotation is list[EvaluationRow] or equivalent."""
+    origin = get_origin(annotation)  # pyright: ignore[reportUnknownArgumentType, reportAny]
+    if origin is not list:
+        return False
+
+    args = get_args(annotation)
+    if len(args) != 1:
+        return False
+
+    # Check if the single argument is EvaluationRow or equivalent
+    arg = args[0]  # pyright: ignore[reportAny]
+    return arg is EvaluationRow or str(arg) == str(EvaluationRow)  # pyright: ignore[reportAny]
+
+
 def validate_signature(
     signature: Signature, mode: EvaluationTestMode, completion_params: Sequence[CompletionParams | None] | None
 ) -> None:
@@ -29,11 +45,13 @@ def validate_signature(
             raise ValueError("In groupwise mode, your eval function must have a parameter named 'rows'")
 
         # validate that "Rows" is of type List[EvaluationRow]
-        if signature.parameters["rows"].annotation is not list[EvaluationRow]:  # pyright: ignore[reportAny]
-            raise ValueError("In groupwise mode, the 'rows' parameter must be of type List[EvaluationRow")
+        if not _is_list_of_evaluation_row(signature.parameters["rows"].annotation):  # pyright: ignore[reportAny]
+            raise ValueError(
+                f"In groupwise mode, the 'rows' parameter must be of type List[EvaluationRow]. Got {str(signature.parameters['rows'].annotation)} instead"  # pyright: ignore[reportAny]
+            )
 
         # validate that the function has a return type of List[EvaluationRow]
-        if signature.return_annotation is not list[EvaluationRow]:  # pyright: ignore[reportAny]
+        if not _is_list_of_evaluation_row(signature.return_annotation):  # pyright: ignore[reportAny]
             raise ValueError("In groupwise mode, your eval function must return a list of EvaluationRow instances")
         if completion_params is not None and len(completion_params) < 2:
             raise ValueError("In groupwise mode, you must provide at least 2 completion parameters")
@@ -43,9 +61,11 @@ def validate_signature(
             raise ValueError("In all mode, your eval function must have a parameter named 'rows'")
 
         # validate that "Rows" is of type List[EvaluationRow]
-        if signature.parameters["rows"].annotation is not list[EvaluationRow]:  # pyright: ignore[reportAny]
-            raise ValueError("In all mode, the 'rows' parameter must be of type List[EvaluationRow")
+        if not _is_list_of_evaluation_row(signature.parameters["rows"].annotation):  # pyright: ignore[reportAny]
+            raise ValueError(
+                f"In all mode, the 'rows' parameter must be of type list[EvaluationRow]. Got {str(signature.parameters['rows'].annotation)} instead"  # pyright: ignore[reportAny]
+            )
 
         # validate that the function has a return type of List[EvaluationRow]
-        if signature.return_annotation is not list[EvaluationRow]:  # pyright: ignore[reportAny]
+        if not _is_list_of_evaluation_row(signature.return_annotation):  # pyright: ignore[reportAny]
             raise ValueError("In all mode, your eval function must return a list of EvaluationRow instances")
diff --git a/tests/pytest/test_get_metadata.py b/tests/pytest/test_get_metadata.py
@@ -1,5 +1,4 @@
 import asyncio
-from typing import Dict, List
 
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.models import EvaluationRow, Message
@@ -19,16 +18,16 @@
     max_concurrent_rollouts=5,
     max_concurrent_evaluations=10,
 )
-def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
+def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
     return rows
 
 
 def test_pytest_func_metainfo():
     assert hasattr(test_pytest_async, "_origin_func")
-    origin_func = test_pytest_async._origin_func
-    assert not asyncio.iscoroutinefunction(origin_func)
+    origin_func = test_pytest_async._origin_func  # pyright: ignore[reportAny, reportFunctionMemberAccess]
+    assert not asyncio.iscoroutinefunction(origin_func)  # pyright: ignore[reportAny]
     assert asyncio.iscoroutinefunction(test_pytest_async)
-    assert test_pytest_async._metainfo["mode"] == "groupwise"
-    assert test_pytest_async._metainfo["max_rollout_concurrency"] == 5
-    assert test_pytest_async._metainfo["max_evaluation_concurrency"] == 10
+    assert test_pytest_async._metainfo["mode"] == "groupwise"  # pyright: ignore[reportAny, reportFunctionMemberAccess]
+    assert test_pytest_async._metainfo["max_rollout_concurrency"] == 5  # pyright: ignore[reportAny, reportFunctionMemberAccess]
+    assert test_pytest_async._metainfo["max_evaluation_concurrency"] == 10  # pyright: ignore[reportAny, reportFunctionMemberAccess]
diff --git a/tests/pytest/test_pytest_async.py b/tests/pytest/test_pytest_async.py
@@ -1,11 +1,7 @@
-import asyncio
-from typing import List
-
 import pytest
 
 from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest import evaluation_test
-from examples.math_example.main import evaluate as math_evaluate
 
 
 @evaluation_test(
@@ -20,7 +16,7 @@
     completion_params=[{"model": "accounts/fireworks/models/kimi-k2-instruct"}],
     mode="all",
 )
-async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
+async def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
     return rows
 
@@ -51,7 +47,7 @@ async def test_pytest_async_main():
             ],
         )
     ]
-    result = await test_pytest_async(rows)
+    result = await test_pytest_async(rows)  # pyright: ignore[reportGeneralTypeIssues, reportUnknownVariableType, reportArgumentType, reportCallIssue]
     assert result == rows
 
 
@@ -65,5 +61,5 @@ async def test_pytest_async_pointwise_main():
             Message(role="user", content="What is the capital of France?"),
         ],
     )
-    result = await test_pytest_async_pointwise(row)
+    result = await test_pytest_async_pointwise(row)  # pyright: ignore[reportGeneralTypeIssues, reportArgumentType, reportUnknownVariableType, reportCallIssue]
     assert result == row

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+experiment_results/`
	`2`	`+`
`1`	`3`	`# Byte-compiled / optimized / DLL files`
`2`	`4`	`__pycache__/`
`3`	`5`	`*.py[cod]`