more complex example

Dylan Huang · Dylan Huang · commit f04e52acf6f9 · 2025-08-26T14:47:58.000-07:00
diff --git a/tests/chinook/agent.py b/tests/chinook/agent.py
@@ -26,10 +26,37 @@ def setup_agent(orchestrator_agent_model: Model):
     )
 
     @agent.tool(retries=5)
-    def execute_sql(ctx: RunContext, query: str) -> tuple[any, ...]:
+    def execute_sql(ctx: RunContext, query: str) -> str:
+        # Runs `query` on the enclosing scope's cursor and returns the result set
+        # as a markdown table, or "No results found." when there are no columns
+        # or no rows. Any failure rolls back the connection and is re-raised as
+        # ModelRetry with the error text so a different query can be attempted.
         try:
             cursor.execute(query)
-            return cursor.fetchall()
+            # Get column headers from cursor description
+            columns = [desc[0] for desc in cursor.description] if cursor.description else []
+            # Get data rows
+            rows = cursor.fetchall()
+
+            if not columns or not rows:
+                return "No results found."
+
+            # Create markdown table
+            table_lines = []
+
+            # Header row
+            table_lines.append("| " + " | ".join(columns) + " |")
+
+            # Separator row
+            table_lines.append("| " + " | ".join(["---"] * len(columns)) + " |")
+
+            # Data rows
+            for row in rows:
+                # Stringify each value and escape pipes so cells cannot split; None becomes an empty cell
+                formatted_row = [str(cell).replace("|", "\\|") if cell is not None else "" for cell in row]
+                table_lines.append("| " + " | ".join(formatted_row) + " |")
+
+            return "\n".join(table_lines)
         except Exception as e:
             connection.rollback()
             raise ModelRetry("Please try again with a different query. Here is the error: " + str(e))
diff --git a/tests/chinook/dataset.py b/tests/chinook/dataset.py
@@ -0,0 +1,57 @@
+from typing import List
+import os
+import glob
+
+from eval_protocol.models import EvaluationRow, Message
+
+
+def collect_dataset() -> List[EvaluationRow]:
+    """
+    Build one EvaluationRow per task folder in the "dataset" directory next to this file.
+
+    For each folder named "task_<n>" (visited in lexicographic path order, so
+    "task_10" sorts before "task_2"), reads "task.txt" and "ground_truth.md" and creates an EvaluationRow where:
+    - messages contains a user message with the task content
+    - ground_truth contains the contents of ground_truth.md
+
+    Both files are read as UTF-8 and stripped of surrounding whitespace.
+    Entries matching "task_*" that are not directories are skipped.
+
+    Raises:
+        FileNotFoundError: if a task folder lacks task.txt or ground_truth.md.
+    """
+    dataset_rows = []
+    dataset_path = os.path.join(os.path.dirname(__file__), "dataset")
+
+    # Find all task folders (task_<n>). glob also matches plain files, so keep
+    # only directories; a stray file would otherwise fail the task.txt check.
+    task_folders = [
+        path for path in glob.glob(os.path.join(dataset_path, "task_*")) if os.path.isdir(path)
+    ]
+
+    for task_folder in sorted(task_folders):
+        # Read task.txt
+        task_file = os.path.join(task_folder, "task.txt")
+        if not os.path.exists(task_file):
+            raise FileNotFoundError(f"Task file not found: {task_file}")
+
+        with open(task_file, "r", encoding="utf-8") as f:
+            task_content = f.read().strip()
+
+        # Read ground_truth.md
+        ground_truth_file = os.path.join(task_folder, "ground_truth.md")
+        if not os.path.exists(ground_truth_file):
+            raise FileNotFoundError(f"Ground truth file not found: {ground_truth_file}")
+
+        with open(ground_truth_file, "r", encoding="utf-8") as f:
+            ground_truth_content = f.read().strip()
+
+        # Create user message with the task
+        user_message = Message(role="user", content=task_content)
+
+        # Create EvaluationRow
+        evaluation_row = EvaluationRow(messages=[user_message], ground_truth=ground_truth_content)
+
+        dataset_rows.append(evaluation_row)
+
+    return dataset_rows
diff --git a/tests/chinook/dataset/task_1/ground_truth.md b/tests/chinook/dataset/task_1/ground_truth.md
@@ -0,0 +1,7 @@
+| customer_name      | favorite_genre | total_invoices | total_spent | spending_rank |
+| ------------------ | -------------- | -------------- | ----------- | ------------- |
+| Helena Holý        | Rock           | 7              | 49.62       | 1             |
+| Richard Cunningham | Rock           | 7              | 47.62       | 2             |
+| Luis Rojas         | Rock           | 7              | 46.62       | 3             |
+| Ladislav Kovács    | Rock           | 7              | 45.62       | 4             |
+| Hugh O'Reilly      | Rock           | 7              | 45.62       | 4             |
diff --git a/tests/chinook/dataset/task_1/task.txt b/tests/chinook/dataset/task_1/task.txt
@@ -0,0 +1 @@
+Find the top 5 customers by total spending, including their favorite genre. Show customer name, favorite genre, total invoices, total spent, and spending rank.
diff --git a/tests/chinook/dataset/task_5/ground_truth.md b/tests/chinook/dataset/task_5/ground_truth.md
@@ -0,0 +1,26 @@
+| genre_name         | usa_revenue | canada_revenue | germany_revenue | france_revenue | brazil_revenue | total_revenue | total_unique_customers | percentage_of_total |
+| ------------------ | ----------- | -------------- | --------------- | -------------- | -------------- | ------------- | ---------------------- | ------------------- |
+| Rock               | 1526.14     | 989.03         | 577.17          | 525.73         | 691.02         | 4309.09       | 35                     | 35.47               |
+| Latin              | 754.67      | 510.84         | 163.35          | 265.32         | 501.93         | 2196.11       | 33                     | 18.08               |
+| Metal              | 554.45      | 335.61         | 189.09          | 192.06         | 74.25          | 1345.46       | 34                     | 11.07               |
+| Alternative & Punk | 415.92      | 300.96         | 105.12          | 207.90         | 71.28          | 1101.18       | 29                     | 9.06                |
+| Jazz               | 202.95      | 126.72         | 27.72           | 139.59         | 0              | 496.98        | 19                     | 4.09                |
+| Blues              | 126.72      | 15.84          | 97.02           | 11.88          | 30.69          | 282.15        | 13                     | 2.32                |
+| TV Shows           | 191.70      | 3.98           | 44.73           | 16.86          | 0              | 257.27        | 10                     | 2.12                |
+| Reggae             | 73.26       | 58.41          | 0               | 13.86          | 83.16          | 228.69        | 8                      | 1.88                |
+| Soundtrack         | 45.54       | 0              | 59.40           | 54.45          | 55.44          | 214.83        | 7                      | 1.77                |
+| Drama              | 143.16      | 13.89          | 14.91           | 38.69          | 0              | 210.65        | 7                      | 1.73                |
+| Classical          | 87.20       | 24.75          | 0               | 57.48          | 39.60          | 209.03        | 9                      | 1.72                |
+| R&B/Soul           | 75.28       | 69.30          | 0               | 0              | 29.70          | 174.28        | 10                     | 1.43                |
+| Alternative        | 79.30       | 0              | 0.99            | 67.44          | 0              | 147.73        | 3                      | 1.22                |
+| Hip Hop/Rap        | 6.93        | 57.45          | 0               | 33.72          | 27.72          | 125.82        | 7                      | 1.04                |
+| Pop                | 33.66       | 0              | 13.86           | 27.72          | 36.63          | 111.87        | 7                      | 0.92                |
+| World              | 0           | 83.16          | 0               | 0              | 27.72          | 110.88        | 5                      | 0.91                |
+| Heavy Metal        | 50.49       | 0              | 41.58           | 0              | 0              | 92.07         | 3                      | 0.76                |
+| Comedy             | 90.44       | 0              | 0               | 0              | 0              | 90.44         | 3                      | 0.74                |
+| Sci Fi & Fantasy   | 71.62       | 0              | 0               | 7.96           | 7.96           | 87.54         | 4                      | 0.72                |
+| Bossa Nova         | 43.56       | 28.71          | 0               | 13.86          | 0              | 86.13         | 7                      | 0.71                |
+| Rock And Roll      | 41.58       | 27.72          | 0               | 13.86          | 0              | 83.16         | 4                      | 0.68                |
+| Electronica/Dance  | 0           | 43.59          | 0               | 33.72          | 0              | 77.31         | 3                      | 0.64                |
+| Easy Listening     | 41.58       | 0              | 27.72           | 0              | 0              | 69.30         | 2                      | 0.57                |
+| Science Fiction    | 10.91       | 0              | 29.82           | 0              | 0              | 40.73         | 2                      | 0.34                |
diff --git a/tests/chinook/dataset/task_5/task.txt b/tests/chinook/dataset/task_5/task.txt
@@ -0,0 +1 @@
+Create a genre popularity matrix by country. Show genre name, revenue by country (USA, Canada, Germany, France, Brazil), total revenue, unique customers, and revenue percentage of total sales.
diff --git a/tests/chinook/test_pydantic_chinook.py b/tests/chinook/test_pydantic_chinook.py
@@ -9,6 +9,8 @@
 from agent import setup_agent
 from pydantic_ai.models.openai import OpenAIModel
 
+from tests.chinook.dataset import collect_dataset
+
 
 @pytest.mark.asyncio
 @evaluation_test(
@@ -75,3 +77,75 @@ class Response(BaseModel):
             reason=result.output.reason,
         )
     return row
+
+
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=collect_dataset(),
+    completion_params=[
+        {
+            "model": {
+                "orchestrator_agent_model": {
+                    "model": "accounts/fireworks/models/kimi-k2-instruct",
+                    "provider": "fireworks",
+                }
+            }
+        },
+    ],
+    rollout_processor=PydanticAgentRolloutProcessor(),
+    rollout_processor_kwargs={"agent": setup_agent},
+    num_runs=3,
+    mode="pointwise",
+)
+async def test_complex_queries(row: EvaluationRow) -> EvaluationRow:
+    """
+    Complex queries for the Chinook database
+
+    Grades the row's last assistant message against row.ground_truth using an
+    LLM judge and stores the resulting score and reason in
+    row.evaluation_result. A missing or empty assistant message scores 0.0
+    without calling the judge.
+    """
+    last_assistant_message = row.last_assistant_message()
+    if last_assistant_message is None or not last_assistant_message.content:
+        # Nothing to grade, so skip the judge call entirely.
+        row.evaluation_result = EvaluateResult(
+            score=0.0,
+            reason="No assistant message found",
+        )
+        return row
+
+    model = OpenAIModel(
+        "accounts/fireworks/models/llama-v3p1-8b-instruct",
+        provider="fireworks",
+    )
+
+    class Response(BaseModel):
+        """
+        A score between 0.0 and 1.0 indicating whether the response is correct.
+        """
+
+        score: float
+
+        """
+        A short explanation of why the response is correct or incorrect.
+        """
+        reason: str
+
+    # Adjacent string literals are concatenated as-is, so the first sentence carries its own trailing space.
+    comparison_agent = Agent(
+        system_prompt=(
+            "Your job is to compare the response to the expected answer. "
+            "If the response is correct, return 1.0. If the response is incorrect, return 0.0."
+        ),
+        output_type=Response,
+        model=model,
+    )
+    result = await comparison_agent.run(
+        f"Expected answer: {row.ground_truth}\nResponse: {last_assistant_message.content}"
+    )
+    row.evaluation_result = EvaluateResult(
+        score=result.output.score,
+        reason=result.output.reason,
+    )
+    return row

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Find the top 5 customers by total spending, including their favorite genre. Show customer name, favorite genre, total invoices, total spent, and spending rank.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Create a genre popularity matrix by country. Show genre name, revenue by country (USA, Canada, Germany, France, Brazil), total revenue, unique customers, and revenue percentage of total sales.`