Skip to content

Commit aaed955

Browse files
committed
Merge branch 'main' into derekx/persist-onto-fireworks
2 parents 55c0aea + fd2bec1 commit aaed955

48 files changed

Lines changed: 18074 additions & 318 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/ci.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@ jobs:
8686
E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
8787
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
8888
FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
89+
SUPABASE_PASSWORD: ${{ secrets.SUPABASE_PASSWORD }}
90+
SUPABASE_HOST: ${{ secrets.SUPABASE_HOST }}
91+
SUPABASE_PORT: ${{ secrets.SUPABASE_PORT }}
92+
SUPABASE_DATABASE: ${{ secrets.SUPABASE_DATABASE }}
93+
SUPABASE_USER: ${{ secrets.SUPABASE_USER }}
8994
PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
9095
run: |
9196
# Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow

README.md

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,61 +2,53 @@
22

33
[![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
44

5-
**Eval Protocol (EP) is the open-source standard and toolkit for practicing Eval-Driven Development.**
5+
**The open-source toolkit for building your internal model leaderboard.**
66

7-
Building with AI is different. Traditional software is deterministic, but AI systems are probabilistic. How do you ship new features without causing silent regressions? How do you prove a new prompt is actually better?
8-
9-
The answer is a new engineering discipline: **Eval-Driven Development (EDD)**. It adapts the rigor of Test-Driven Development for the uncertain world of AI. With EDD, you define your AI's desired behavior as a suite of executable tests, creating a safety net that allows you to innovate with confidence.
10-
11-
EP provides a consistent way to write evals, store traces, and analyze results.
12-
13-
<p align="center">
14-
<img src="https://raw.githubusercontent.com/eval-protocol/python-sdk/refs/heads/main/assets/ui.png" alt="UI" />
15-
<br>
16-
<sub><b>Log Viewer: Monitor your evaluation rollouts in real time.</b></sub>
17-
</p>
7+
When you have multiple AI models to choose from—different versions, providers, or configurations—how do you know which one is best for your use case?
188

199
## Quick Example
2010

21-
Here's a simple test function that checks if a model's response contains **bold** text formatting:
11+
Compare models on a simple formatting task:
2212

2313
```python test_bold_format.py
2414
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
25-
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
15+
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
2616

2717
@evaluation_test(
2818
input_messages=[
2919
[
30-
Message(role="system", content="You are a helpful assistant. Use bold text to highlight important information."),
31-
Message(role="user", content="Explain why **evaluations** matter for building AI agents. Make it dramatic!"),
20+
Message(role="system", content="Use bold text to highlight important information."),
21+
Message(role="user", content="Explain why evaluations matter for AI agents. Make it dramatic!"),
3222
],
3323
],
34-
completion_params=[{"model": "accounts/fireworks/models/llama-v3p1-8b-instruct"}],
35-
rollout_processor=SingleTurnRolloutProcessor(),
24+
model=[
25+
"fireworks_ai/accounts/fireworks/models/llama-v3p1-8b-instruct",
26+
"openai/gpt-4",
27+
"anthropic/claude-3-sonnet"
28+
],
29+
rollout_processor=default_single_turn_rollout_processor,
3630
mode="pointwise",
3731
)
3832
def test_bold_format(row: EvaluationRow) -> EvaluationRow:
39-
"""
40-
Simple evaluation that checks if the model's response contains bold text.
41-
"""
42-
33+
"""Check if the model's response contains bold text."""
4334
assistant_response = row.messages[-1].content
4435

45-
# Check if response contains **bold** text
46-
has_bold = "**" in assistant_response
36+
if assistant_response is None:
37+
row.evaluation_result = EvaluateResult(score=0.0, reason="No response")
38+
return row
4739

48-
if has_bold:
49-
result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
50-
else:
51-
result = EvaluateResult(score=0.0, reason="❌ No bold text found")
40+
has_bold = "**" in str(assistant_response)
41+
score = 1.0 if has_bold else 0.0
42+
reason = "Contains bold text" if has_bold else "No bold text found"
5243

53-
row.evaluation_result = result
44+
row.evaluation_result = EvaluateResult(score=score, reason=reason)
5445
return row
5546
```
5647

57-
## Documentation
48+
## 📚 Resources
5849

59-
See our [documentation](https://evalprotocol.io) for more details.
50+
- **[Documentation](https://evalprotocol.io)** - Complete guides and API reference
51+
- **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** - Community discussions
6052

6153
## Installation
6254

eval_protocol/models.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -397,15 +397,11 @@ def __iter__(self):
397397

398398
CompletionParams = Dict[str, Any]
399399
"""
400-
Common set of completion parameters that most model providers support in their
401-
API. Set total=False to allow extra fields since LiteLLM + providers have their
402-
own set of parameters. The following parameters are common fields that are
403-
populated.
404-
405-
model: str
406-
temperature: Optional[float]
407-
max_tokens: Optional[int]
408-
top_p: Optional[float]
400+
The completion parameters for the respective LLM SDK or agent framework.
401+
Depending on the rollout processor, this might be the parameters passed to
402+
LiteLLM completion call or parameters for the "run" method of the "Agent" class
403+
in Pydantic AI. You can also customize this dictionary to whatever you need if
404+
you implement your own custom rollout processor.
409405
"""
410406

411407

@@ -576,6 +572,13 @@ def get_assistant_messages(self) -> List[Message]:
576572
"""Returns only the assistant messages from the conversation."""
577573
return [msg for msg in self.messages if msg.role == "assistant"]
578574

575+
def last_assistant_message(self) -> Optional[Message]:
    """Return the most recent assistant message, or ``None`` if there is none."""
    candidates = self.get_assistant_messages()
    return candidates[-1] if candidates else None
581+
579582
def get_user_messages(self) -> List[Message]:
580583
"""Returns only the user messages from the conversation."""
581584
return [msg for msg in self.messages if msg.role == "user"]

eval_protocol/pytest/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,15 @@
88
from .rollout_processor import RolloutProcessor
99
from .types import RolloutProcessorConfig
1010

11+
# Conditional import for optional dependency
12+
try:
13+
from .default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor
14+
15+
PYDANTIC_AI_AVAILABLE = True
16+
except ImportError:
17+
PYDANTIC_AI_AVAILABLE = False
18+
PydanticAgentRolloutProcessor = None
19+
1120
__all__ = [
1221
"AgentRolloutProcessor",
1322
"MCPGymRolloutProcessor",
@@ -21,3 +30,7 @@
2130
"BackoffConfig",
2231
"get_default_exception_handler_config",
2332
]
33+
34+
# Only add to __all__ if available
35+
if PYDANTIC_AI_AVAILABLE:
36+
__all__.append("PydanticAgentRolloutProcessor")
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
import asyncio
2+
import logging
3+
import types
4+
from typing import List
5+
6+
from attr import dataclass
7+
from openai.types.chat.chat_completion_assistant_message_param import ChatCompletionAssistantMessageParam
8+
9+
from eval_protocol.models import EvaluationRow, Message
10+
from eval_protocol.pytest.rollout_processor import RolloutProcessor
11+
from eval_protocol.pytest.types import RolloutProcessorConfig
12+
from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
13+
from openai.types.chat.chat_completion import Choice as ChatCompletionChoice
14+
from pydantic_ai.models.anthropic import AnthropicModel
15+
from pydantic_ai.models.openai import OpenAIModel
16+
from pydantic_ai.models.google import GoogleModel
17+
from pydantic import TypeAdapter
18+
from pydantic_ai.messages import ModelMessage
19+
from pydantic_ai._utils import generate_tool_call_id
20+
from pydantic_ai import Agent
21+
from pydantic_ai.messages import (
22+
ModelRequest,
23+
SystemPromptPart,
24+
ToolReturnPart,
25+
UserPromptPart,
26+
)
27+
from pydantic_ai.providers.openai import OpenAIProvider
28+
from typing_extensions import TypedDict
29+
30+
logger = logging.getLogger(__name__)
31+
32+
33+
class PydanticAgentRolloutProcessor(RolloutProcessor):
    """Rollout processor for Pydantic AI agents.

    Mainly converts ``EvaluationRow.messages`` to and from the Pydantic AI
    ``ModelMessage`` format, then runs the configured agent on each row.
    """

    def __init__(self):
        # Dummy model instance: never called as a model — we only borrow its
        # private message-conversion helpers (_map_messages / _process_response).
        self.util = OpenAIModel("dummy-model", provider=OpenAIProvider(api_key="dummy"))

    def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
        """Create agent rollout tasks and return them for external handling.

        Args:
            rows: Evaluation rows whose ``messages`` become the agent's
                message history.
            config: Rollout configuration. ``config.kwargs["agent"]`` must be
                either a Pydantic AI ``Agent`` instance or a function that
                returns one (a factory). In the factory form,
                ``config.completion_params["model"]`` must be a dict mapping
                factory argument names to model config dicts (with "model"
                and "provider" keys).

        Returns:
            One ``asyncio.Task`` per input row; each task resolves to its row
            with ``messages`` replaced by the full agent transcript.

        Raises:
            ValueError: If the "agent" entry is missing or has the wrong type,
                or if the factory form is used with a non-dict model config.
        """
        # Bound concurrency; fall back to 8 when the attribute is unset or falsy.
        max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
        semaphore = asyncio.Semaphore(max_concurrent)

        # Validate that the "agent" field is present with a valid Pydantic AI
        # Agent instance (or agent factory function) in the kwargs dict.
        if "agent" not in config.kwargs:
            raise ValueError("kwargs must contain an 'agent' field with a valid Pydantic AI Agent instance")
        agent_or_factory = config.kwargs["agent"]
        if not isinstance(agent_or_factory, (Agent, types.FunctionType)):
            raise ValueError(
                "kwargs['agent'] must be a valid Pydantic AI Agent instance or a function that returns an Agent"
            )

        if isinstance(agent_or_factory, types.FunctionType):
            # Factory form: build one model object per factory argument, then
            # call the factory to obtain the agent. The agent then owns its
            # models, so no per-run model override is passed to Agent.run().
            setup_agent = agent_or_factory
            if not isinstance(config.completion_params["model"], dict):
                raise ValueError(
                    "completion_params['model'] must be a dict mapping agent argument names to model config dicts (with 'model' and 'provider' keys)"
                )
            factory_kwargs = {}
            for arg_name, model_cfg in config.completion_params["model"].items():
                model_name = model_cfg["model"]
                if model_name and model_name.startswith("anthropic:"):
                    factory_kwargs[arg_name] = AnthropicModel(model_name.removeprefix("anthropic:"))
                elif model_name and model_name.startswith("google:"):
                    factory_kwargs[arg_name] = GoogleModel(model_name.removeprefix("google:"))
                else:
                    # Default: OpenAI-compatible model with an explicit provider.
                    factory_kwargs[arg_name] = OpenAIModel(model_name, provider=model_cfg["provider"])
            agent = setup_agent(**factory_kwargs)
            model = None
        else:
            # Plain Agent instance: the model/provider pair from
            # completion_params is passed to Agent.run() as an override.
            agent = agent_or_factory
            model = OpenAIModel(
                config.completion_params["model"],
                provider=config.completion_params["provider"],
            )

        async def process_row(row: EvaluationRow) -> EvaluationRow:
            """Run the agent on a single row and store the full transcript."""
            model_messages = [self.convert_ep_message_to_pyd_message(m, row) for m in row.messages]
            response = await agent.run(
                message_history=model_messages, model=model, usage_limits=config.kwargs.get("usage_limits")
            )
            # Replace the row's messages with the complete conversation,
            # including the agent's new assistant/tool messages.
            row.messages = await self.convert_pyd_message_to_ep_message(response.all_messages())
            return row

        async def _sem_wrapper(row: EvaluationRow) -> EvaluationRow:
            # Cap the number of rollouts in flight at once.
            async with semaphore:
                return await process_row(row)

        # Create and return tasks for external handling (the caller awaits them).
        return [asyncio.create_task(_sem_wrapper(row)) for row in rows]

    async def convert_pyd_message_to_ep_message(self, messages: list[ModelMessage]) -> list[Message]:
        """Convert Pydantic AI messages to eval-protocol ``Message`` objects.

        Uses the dummy model's private ``_map_messages`` helper to produce
        OpenAI-format dicts, then validates them into ``Message`` instances.
        """
        oai_messages: list[ChatCompletionMessageParam] = await self.util._map_messages(messages)
        return [Message(**m) for m in oai_messages]

    def convert_ep_message_to_pyd_message(self, message: Message, row: EvaluationRow) -> ModelMessage:
        """Convert one eval-protocol ``Message`` to a Pydantic AI ``ModelMessage``.

        Args:
            message: The eval-protocol message to convert.
            row: The row the message belongs to; its ``created_at`` is used as
                the synthetic completion timestamp for assistant messages.

        Raises:
            ValueError: If the role is unknown, or the content type is not
                supported for the given role (previously these cases fell
                through and returned ``None`` implicitly).
        """
        if message.role == "assistant":
            # Round-trip through an OpenAI ChatCompletion so the dummy model's
            # _process_response helper can build the ModelResponse for us.
            oai_message = TypeAdapter(ChatCompletionAssistantMessageParam).validate_python(message)
            # Provide required finish_reason and index, and ensure created is
            # an int timestamp whether created_at is a datetime or a number.
            created = int(row.created_at.timestamp()) if hasattr(row.created_at, "timestamp") else int(row.created_at)
            return self.util._process_response(
                ChatCompletion(
                    choices=[ChatCompletionChoice(message=oai_message, finish_reason="stop", index=0)],
                    object="chat.completion",
                    model="",
                    id="",
                    created=created,
                )
            )
        if message.role == "user":
            if isinstance(message.content, str):
                return ModelRequest(parts=[UserPromptPart(content=message.content)])
            if isinstance(message.content, list):
                # NOTE(review): only the first content part is kept — assumes
                # single-part content; confirm multi-part messages are not needed.
                return ModelRequest(parts=[UserPromptPart(content=message.content[0].text)])
            raise ValueError(f"Unsupported user content type: {type(message.content)!r}")
        if message.role == "system":
            if isinstance(message.content, str):
                return ModelRequest(parts=[SystemPromptPart(content=message.content)])
            if isinstance(message.content, list):
                return ModelRequest(parts=[SystemPromptPart(content=message.content[0].text)])
            raise ValueError(f"Unsupported system content type: {type(message.content)!r}")
        if message.role == "tool":
            return ModelRequest(
                parts=[
                    ToolReturnPart(
                        content=message.content,
                        # Tool name is not tracked on eval-protocol messages.
                        tool_name="",
                        tool_call_id=message.tool_call_id or generate_tool_call_id(),
                    )
                ]
            )
        raise ValueError(f"Unknown role: {message.role}")

eval_protocol/pytest/evaluation_test.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
parse_ep_max_concurrent_rollouts,
6262
parse_ep_num_runs,
6363
parse_ep_completion_params,
64+
parse_ep_passed_threshold,
6465
rollout_processor_with_retry,
6566
sanitize_filename,
6667
)
@@ -538,6 +539,7 @@ def evaluation_test( # noqa: C901
538539
max_dataset_rows = parse_ep_max_rows(max_dataset_rows)
539540
completion_params = parse_ep_completion_params(completion_params)
540541
original_completion_params = completion_params
542+
passed_threshold = parse_ep_passed_threshold(passed_threshold)
541543

542544
def decorator(
543545
test_func: TestFunction,
@@ -925,16 +927,18 @@ async def _collect_result(config, lst):
925927
r.eval_metadata.status = Status.eval_finished()
926928
active_logger.log(r)
927929

928-
tasks = []
929-
for i in range(num_runs):
930-
tasks.append(asyncio.create_task(execute_run(i, config)))
931-
932930
# if rollout_processor is McpGymRolloutProcessor, we execute runs sequentially since McpGym does not support concurrent runs
933931
# else, we execute runs in parallel
934932
if isinstance(rollout_processor, MCPGymRolloutProcessor):
935-
for task in tasks:
933+
# For MCPGymRolloutProcessor, create and execute tasks one at a time to avoid port conflicts
934+
for i in range(num_runs):
935+
task = asyncio.create_task(execute_run(i, config))
936936
await task
937937
else:
938+
# For other processors, create all tasks at once and run in parallel
939+
tasks = []
940+
for i in range(num_runs):
941+
tasks.append(asyncio.create_task(execute_run(i, config)))
938942
await asyncio.gather(*tasks)
939943

940944
# for groupwise mode, the result contains eval output from multiple completion_params, so we need to differentiate them

0 commit comments

Comments
 (0)