Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ jobs:
--ignore=tests/pytest/test_apps_coding.py \
--ignore=tests/test_tau_bench_airline_smoke.py \
--ignore=tests/pytest/test_svgbench.py \
--ignore=tests/pytest/test_livesvgbench.py \
--cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10

- name: Store coverage file
Expand Down
1 change: 1 addition & 0 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ async def _execute_with_semaphore(row):
for result in all_results:
for r in result:
if r.eval_metadata is not None:
r.eval_metadata.status = "finished"
r.eval_metadata.passed = passed
active_logger.log(r)

Expand Down
14 changes: 6 additions & 8 deletions eval_protocol/pytest/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ def pytest_addoption(parser) -> None:
"--ep-max-retry",
action="store",
type=int,
default=None,
default=0,
help=("Failed rollouts (with rollout_status.status == 'error') will be retried up to this many times."),
)
group.addoption(
"--ep-fail-on-permanent-failure",
"--ep-fail-on-max-retry",
action="store",
default=None,
default="true",
choices=["true", "false"],
help=(
"Whether to fail the entire rollout when permanent failures occur after max retries. "
Expand Down Expand Up @@ -118,12 +118,10 @@ def pytest_configure(config) -> None:
os.environ["EP_SUMMARY_JSON"] = summary_json_path

max_retry = config.getoption("--ep-max-retry")
if max_retry is not None:
os.environ["EP_MAX_RETRY"] = str(max_retry)
os.environ["EP_MAX_RETRY"] = str(max_retry)

fail_on_permanent_failure = config.getoption("--ep-fail-on-permanent-failure")
if fail_on_permanent_failure is not None:
os.environ["EP_FAIL_ON_PERMANENT_FAILURE"] = fail_on_permanent_failure
fail_on_max_retry = config.getoption("--ep-fail-on-max-retry")
os.environ["EP_FAIL_ON_MAX_RETRY"] = fail_on_max_retry

# Allow ad-hoc overrides of input params via CLI flags
try:
Expand Down
10 changes: 8 additions & 2 deletions eval_protocol/pytest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,13 @@ async def retry_handler(failed_row: EvaluationRow):

async def initial_processor():
"""Process initial batch and spawn retries for failures"""
base_tasks = rollout_processor(fresh_dataset, config)
# catch any task creation errors and raise them immediately, i.e. port already in use
try:
base_tasks = rollout_processor(fresh_dataset, config)
except Exception as e:
print(f"❌ Rollout processor failed to initialize: {e}")
raise e

pending = set(base_tasks)

while pending:
Expand Down Expand Up @@ -310,7 +316,7 @@ async def initial_processor():

# only permanent failure rows are put on the queue, so we can check for them here
if finished_row.rollout_status and finished_row.rollout_status.status == "error":
if os.getenv("EP_FAIL_ON_PERMANENT_FAILURE", "true") != "false":
if max_retry > 0 and os.getenv("EP_FAIL_ON_MAX_RETRY", "true") != "false":
raise RuntimeError(
f"Rollout {finished_row.execution_metadata.rollout_id} failed after {max_retry} retries. Errors: {finished_row.rollout_status.termination_reason}"
)
Expand Down
210 changes: 105 additions & 105 deletions tests/pytest/data/svgbench_dataset.jsonl

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions tests/pytest/data/svgbench_sample_dataset.jsonl
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"requirements": ["Cow must be clearly recognizable with distinctive bovine features", "Include cow body, head, four legs, tail, and udder", "Add cow ears, eyes, and snout for facial recognition", "Cow should be positioned in a realistic plowing stance (pulling forward)", "Use appropriate cow coloring (black/white patches, brown, or solid color)", "Include a traditional plow with visible blade/share", "Show plow handles extending upward", "Depict connection mechanism between cow and plow (yoke, harness, or chains)", "Plow should appear to be cutting into the soil", "Show ground/soil with visible furrows behind the plow", "Include plowed and unplowed sections of field", "Add simple background elements (horizon line, sky)", "Include basic vegetation or crops"], "prompt": "Write `svg` code to draw an image of a cow plowing a field.", "id": "cow_plowing"}
{"requirements": ["The overall background of the SVG must be white", "All primary elements must be horizontally centered on the canvas", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red)", "Place a prominent search bar directly below the Google logo", "The search bar must be a rounded rectangle with a light gray border", "The search bar must contain a gray magnifying glass icon on the left side", "The search bar must contain a gray microphone icon on the right side", "Place two distinct buttons below the search bar", "The left button must be labeled 'Google Search'", "The right button must be labeled 'I'm Feeling Lucky'", "Buttons should have a light gray background, a thin border, and dark gray text", "Create a header section at the top right of the canvas", "The header must include text links for 'Gmail' and 'Images'", "The header must include a 3x3 grid icon (Google Apps launcher)", "The header must include a prominent 'Sign in' button, typically with a blue background and white text"], "prompt": "Write `svg` code for a screenshot of the [Google homepage](https://google.com).", "id": "google_homepage"}
{"requirements": ["Create a primary circular or elliptical shape for the top surface of a round dinner table", "The table should have a distinct color or a simple texture like wood grain", "Include exactly 4 sets of cutlery arranged around the table", "Each cutlery set must consist of a recognizable fork, knife, and spoon", "Position the 4 cutlery sets at distinct place settings (e.g., at 12, 3, 6, and 9 o'clock positions)", "Optionally, include a round dinner plate at each of the 4 place settings", "Place exactly 3 main food dishes on the surface of the table", "First dish: A recognizable roasted turkey, golden-brown in color, showing drumsticks and a plump body", "The turkey should be presented on its own platter or serving dish", "Second dish: A round pizza, cut into slices, with visible crust and toppings", "Third dish: A serving of tacos (at least two), with visible folded shells and fillings (e.g., lettuce, meat, cheese)", "The tacos should be on a plate or in a holder", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap", "The overall perspective should be top-down or slightly isometric"], "prompt": "Write `svg` code for an image of a round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos.", "id": "dinner_table"}
{"requirements": ["Cow must be clearly recognizable with distinctive bovine features", "Include cow body, head, four legs, tail, and udder", "Add cow ears, eyes, and snout for facial recognition", "Cow should be positioned in a realistic plowing stance (pulling forward)", "Use appropriate cow coloring (black/white patches, brown, or solid color)", "Include a traditional plow with visible blade/share", "Show plow handles extending upward", "Depict connection mechanism between cow and plow (yoke, harness, or chains)", "Plow should appear to be cutting into the soil", "Show ground/soil with visible furrows behind the plow", "Include plowed and unplowed sections of field", "Add simple background elements (horizon line, sky)", "Include basic vegetation or crops"], "prompt": "Write `svg` code to draw an image of a cow plowing a field."}
{"requirements": ["The overall background of the SVG must be white", "All primary elements must be horizontally centered on the canvas", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red)", "Place a prominent search bar directly below the Google logo", "The search bar must be a rounded rectangle with a light gray border", "The search bar must contain a gray magnifying glass icon on the left side", "The search bar must contain a gray microphone icon on the right side", "Place two distinct buttons below the search bar", "The left button must be labeled 'Google Search'", "The right button must be labeled 'I'm Feeling Lucky'", "Buttons should have a light gray background, a thin border, and dark gray text", "Create a header section at the top right of the canvas", "The header must include text links for 'Gmail' and 'Images'", "The header must include a 3x3 grid icon (Google Apps launcher)", "The header must include a prominent 'Sign in' button, typically with a blue background and white text"], "prompt": "Write `svg` code for a screenshot of the [Google homepage](https://google.com)."}
{"requirements": ["Create a primary circular or elliptical shape for the top surface of a round dinner table", "The table should have a distinct color or a simple texture like wood grain", "Include exactly 4 sets of cutlery arranged around the table", "Each cutlery set must consist of a recognizable fork, knife, and spoon", "Position the 4 cutlery sets at distinct place settings (e.g., at 12, 3, 6, and 9 o'clock positions)", "Optionally, include a round dinner plate at each of the 4 place settings", "Place exactly 3 main food dishes on the surface of the table", "First dish: A recognizable roasted turkey, golden-brown in color, showing drumsticks and a plump body", "The turkey should be presented on its own platter or serving dish", "Second dish: A round pizza, cut into slices, with visible crust and toppings", "Third dish: A serving of tacos (at least two), with visible folded shells and fillings (e.g., lettuce, meat, cheese)", "The tacos should be on a plate or in a holder", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap", "The overall perspective should be top-down or slightly isometric"], "prompt": "Write `svg` code for an image of a round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos."}
Loading
Loading