2 changes: 1 addition & 1 deletion eval_protocol/pytest/default_agent_rollout_processor.py
@@ -135,7 +135,7 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
         async def process_row(row: EvaluationRow) -> EvaluationRow:
             """Process a single row with agent rollout."""
             agent = Agent(
-                model=config.completion_params["model"],
+                model=row.input_metadata.completion_params["model"],
                 row=row,
                 config_path=config.mcp_config_path,
                 logger=config.logger,
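Context for this change: the rollout now reads the model from each row's own input_metadata rather than from the shared rollout config, which is what lets the rows of one groupwise group target different models. A minimal, hypothetical illustration (the helper name model_for is not part of the library; only the attribute path comes from this PR's tests):

# Hypothetical helper (illustration only): resolve the model per row, as the patched
# rollout processor now does, so rows in one groupwise group can use different models.
from eval_protocol.models import EvaluationRow


def model_for(row: EvaluationRow) -> str:
    return row.input_metadata.completion_params["model"]  # e.g. "gpt-4.1" vs ".../gpt-oss-120b"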
482 changes: 294 additions & 188 deletions eval_protocol/pytest/evaluation_test.py

Large diffs are not rendered by default.
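Since the evaluation_test.py diff is not rendered, the following is only a rough sketch of what the new mode dispatch plausibly does, inferred from the tests added in this PR (an assumption, not the actual implementation): pointwise calls the eval once per rollout, groupwise collects rollouts that came from the same original row and passes the group, and all passes the entire dataset.

from collections import defaultdict
from typing import Callable, List

from eval_protocol.models import EvaluationRow


def run_evaluation(rows: List[EvaluationRow], eval_fn: Callable, mode: str) -> List[EvaluationRow]:
    # Hypothetical dispatcher -- illustrates the three modes, not the library's actual code.
    if mode == "pointwise":
        return [eval_fn(row) for row in rows]  # one rollout result at a time
    if mode == "groupwise":
        groups = defaultdict(list)
        for row in rows:
            groups[row.input_metadata.row_id].append(row)  # group rollouts of the same original row
        return [r for group in groups.values() for r in eval_fn(group)]
    return eval_fn(rows)  # mode == "all": the whole dataset at once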

11 changes: 4 additions & 7 deletions eval_protocol/pytest/types.py
@@ -19,14 +19,11 @@
 
 Dataset = List[EvaluationRow]
 
-EvaluationTestMode = Literal["batch", "pointwise"]
+EvaluationTestMode = Literal["pointwise", "groupwise", "all"]
 """
-"batch": (default) expects test function to handle full dataset.
-"pointwise": applies test function to each row.
-
-How to choose between "batch" and "pointwise":
-If your evaluation requires the rollout of all rows to be passed into your eval compute the score, use "batch".
-If your evaluation can be computed pointwise, use "pointwise" as EP can pipeline the rollouts and evals to be faster.
+"pointwise": (default) applies the test function to each row (rollout result).
+"groupwise": applies the test function to a group of rollout results from the same original row (for use cases such as DPO/GRPO).
+"all": applies the test function to the whole dataset.
 """
 
 """
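For reference, this is how the three modes map onto test-function signatures in the tests touched by this PR (a condensed sketch assembled from those tests, not an excerpt from evaluation_test.py; a real groupwise run would typically list more than one completion_params entry, as in tests/pytest/test_pytest_groupwise.py):

from typing import List

from eval_protocol.models import EvaluationRow, Message
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test

COMMON = dict(
    input_messages=[[Message(role="user", content="What is the capital of France?")]],
    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
    rollout_processor=SingleTurnRolloutProcessor(),
)


@evaluation_test(**COMMON, mode="pointwise")  # default: called once per rollout result
def eval_pointwise(row: EvaluationRow) -> EvaluationRow:
    return row


@evaluation_test(**COMMON, mode="groupwise")  # called once per group of rollouts from the same original row
def eval_groupwise(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    return rows


@evaluation_test(**COMMON, mode="all")  # called once with the whole dataset
def eval_all(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    return rows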
1 change: 1 addition & 0 deletions tests/pytest/test_pytest_async.py
@@ -18,6 +18,7 @@
         ],
     ],
     completion_params=[{"model": "accounts/fireworks/models/kimi-k2-instruct"}],
+    mode="all",
 )
 async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
@@ -18,6 +18,7 @@
     ],
     rollout_processor=AgentRolloutProcessor(),
     completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
+    mode="all",
 )
 def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
28 changes: 28 additions & 0 deletions tests/pytest/test_pytest_groupwise.py
@@ -0,0 +1,28 @@
from typing import List

from eval_protocol.models import EvaluationRow, Message, EvaluateResult
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test


@evaluation_test(
input_messages=[
[
Message(role="user", content="What is the capital of France?"),
]
],
completion_params=[
{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
{"model": "fireworks_ai/accounts/fireworks/models/gpt-4.1"},
],
rollout_processor=SingleTurnRolloutProcessor(),
mode="groupwise",
)
def test_pytest_groupwise(rows: List[EvaluationRow]) -> List[EvaluationRow]:
"""Run math evaluation on sample dataset using pytest interface."""
assert rows[0].input_metadata.completion_params["model"] == "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"
assert rows[1].input_metadata.completion_params["model"] == "fireworks_ai/accounts/fireworks/models/gpt-4.1"
rows[0].evaluation_result = EvaluateResult(score=1.0, reason="test")
rows[1].evaluation_result = EvaluateResult(score=0.0, reason="test")
print(rows[0].model_dump_json())
print(rows[1].model_dump_json())
return rows
1 change: 1 addition & 0 deletions tests/pytest/test_pytest_input_messages.py
@@ -12,6 +12,7 @@
     ],
     completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
     rollout_processor=SingleTurnRolloutProcessor(),
+    mode="all",
 )
 def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
177 changes: 177 additions & 0 deletions tests/pytest/test_svgbench.py
@@ -264,6 +264,74 @@ def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[str, Any]:
raise ValueError("Missing required field in response")


def evaluate_with_llm_judge_groupwise(image_paths: List[str], requirements: List[str]) -> Dict[str, Any]:
"""
Use LLM judge to evaluate how many requirements are fulfilled.
Uses GPT-4.1 for vision capabilities to match project's model preferences. (note original repo uses Gemini 2.5 flashs)

Args:
image_path: Path to rendered PNG image
requirements: List of requirements to evaluate

Returns:
Dictionary with evaluation results
"""
# Format requirements for evaluation (exactly as in original)
requirements_text = "\n".join([f"{i + 1}. {req}" for i, req in enumerate(requirements)])

# Create evaluation prompt with JSON response format
evaluate_prompt = f"""Examine the generated images you are given. Based on the following {len(requirements)} requirements, which one is better?

Respond ONLY with a JSON object in this exact format:
{{"best_image_index": <index>, "reasoning": <reasoning_text>}}

Requirements:
{requirements_text}"""

messages = [
{
"role": "user",
"content": [
{"type": "text", "text": evaluate_prompt},
],
}
]

    # Read and encode each image, attaching it to the judge message as an image_url part
for image_path in image_paths:
with open(image_path, "rb") as f:
image_data = base64.b64encode(f.read()).decode("utf-8")
messages[0]["content"].append(
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}}
)

# Use GPT-4.1 for vision capabilities to match project's OpenAI model preference
response = litellm.completion(
model="gpt-4.1",
messages=messages,
temperature=0.0,
response_format={
"type": "json_schema",
"json_schema": {"name": "SVGBenchResponse", "schema": SVGBenchResponse.model_json_schema()},
},
)

# Parse response
response_content = response.choices[0].message.content

# Handle empty response
if not response_content or response_content.strip() == "":
raise ValueError("Empty response from LLM judge")

result = json.loads(response_content)

# Validate the result
if "best_image_index" in result:
return result
else:
raise ValueError("Missing required field in response")


@evaluation_test(
input_dataset=["tests/pytest/data/svgbench_dataset.jsonl"],
dataset_adapter=svgbench_to_evaluation_row,
@@ -279,6 +347,7 @@ def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[str, Any]:
     passed_threshold=0.5,  # 50% average score to pass
     num_runs=1,
     mode="pointwise",
+    max_dataset_rows=1,
     max_concurrent_rollouts=50,
 )
 def test_svg_generation_evaluation(row: EvaluationRow) -> EvaluationRow:
@@ -378,3 +447,111 @@ def test_svg_generation_evaluation(row: EvaluationRow) -> EvaluationRow:
os.unlink(png_path)
except Exception:
pass


@evaluation_test(
input_dataset=["tests/pytest/data/svgbench_dataset.jsonl"],
dataset_adapter=svgbench_to_evaluation_row,
completion_params=[
{"temperature": 0.0, "model": "gpt-4.1"},
{
"temperature": 0.8,
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
"extra_body": {"reasoning_effort": "high"},
},
],
rollout_processor=SingleTurnRolloutProcessor(),
passed_threshold=None,
num_runs=1,
max_dataset_rows=3,
mode="groupwise",
max_concurrent_rollouts=50,
)
def test_svg_generation_evaluation_groupwise(rows: List[EvaluationRow]) -> List[EvaluationRow]:
"""
Test SVG generation and evaluation using SVGBench methodology.

This test:
1. Extracts SVG code from the model's response
2. Renders SVG to PNG using Selenium
3. Uses LLM judge to evaluate requirement fulfillment
4. Calculates score based on fulfilled requirements ratio

Args:
row: EvaluationRow with model's SVG generation response

Returns:
EvaluationRow with evaluation results
"""
# Extract dataset info
image_paths = []
requirements = rows[0].input_metadata.dataset_info["requirements"]
for row in rows:
row_id = row.input_metadata.row_id

# Check if we should save debug files
save_debug_files = os.environ.get("SVGBENCH_SAVE_DEBUG_FILES", "false").lower() == "true"

# Get model response
if not row.messages or len(row.messages) < 2:
row.evaluation_result = EvaluateResult(score=0.0, reason="No model response found")
continue

model_response = row.messages[-1].content

# Extract SVG code with better error reporting (matching original)
try:
svg_code = extract_svg_code(model_response)
if not svg_code:
raise ValueError("No valid SVG code found in response")
except Exception as e:
logger.error(f"Error extracting SVG code for question {row_id}: {e}")
if save_debug_files:
logger.error(f"Full response: {model_response}")

row.evaluation_result = EvaluateResult(score=0.0, reason=f"SVG extraction failed: {str(e)}")
continue

# Setup file paths
if save_debug_files:
# Create debug directory
model = row.input_metadata.completion_params["model"]
# Sanitize model name for filesystem (replace slashes with underscores)
safe_model_name = model.replace("/", "_").replace(":", "_")
debug_dir = "svgbench_debug"
os.makedirs(debug_dir, exist_ok=True)
png_path = os.path.join(debug_dir, f"question_{row_id}_{safe_model_name}.png")
svg_path = os.path.join(debug_dir, f"question_{row_id}_{safe_model_name}.svg")
# Save SVG file for debugging
with open(svg_path, "w") as f:
f.write(svg_code)
else:
# Use temporary file
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
png_path = f.name
image_paths.append(png_path)
try:
# Render SVG to PNG
if not render_svg_to_png(svg_code, png_path):
row.evaluation_result = EvaluateResult(score=0.0, reason="Failed to render SVG to PNG")

except Exception as e:
logger.error(f"Evaluation failed for question {row_id}: {e}")
row.evaluation_result = EvaluateResult(score=0.0, reason=f"Evaluation error: {str(e)}")

judge_result = evaluate_with_llm_judge_groupwise(image_paths, requirements)
print(f"********** judge_result: {judge_result} **********")
if judge_result.get("best_image_index") == 0:
rows[0].evaluation_result = EvaluateResult(score=1.0, reason=judge_result.get("reasoning", ""))
rows[1].evaluation_result = EvaluateResult(score=0.0, reason=judge_result.get("reasoning", ""))
else:
rows[0].evaluation_result = EvaluateResult(score=0.0, reason=judge_result.get("reasoning", ""))
rows[1].evaluation_result = EvaluateResult(score=1.0, reason=judge_result.get("reasoning", ""))

# Clean up temporary PNG file (only if not saving debug files)
if not save_debug_files:
for png_path in image_paths:
if os.path.exists(png_path):
os.unlink(png_path)

return rows