Skip to content

Commit 7386347

Browse files
committed
Fix bugs in retry processor
1 parent 3f6545f commit 7386347

File tree

3 files changed

+14
-11
lines changed

3 files changed

+14
-11
lines changed

eval_protocol/pytest/plugin.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,13 @@ def pytest_addoption(parser) -> None:
6363
"--ep-max-retry",
6464
action="store",
6565
type=int,
66-
default=None,
66+
default=0,
6767
help=("Failed rollouts (with rollout_status.status == 'error') will be retried up to this many times."),
6868
)
6969
group.addoption(
70-
"--ep-fail-on-permanent-failure",
70+
"--ep-fail-on-max-retry",
7171
action="store",
72-
default=None,
72+
default="true",
7373
choices=["true", "false"],
7474
help=(
7575
"Whether to fail the entire rollout when permanent failures occur after max retries. "
@@ -118,12 +118,10 @@ def pytest_configure(config) -> None:
118118
os.environ["EP_SUMMARY_JSON"] = summary_json_path
119119

120120
max_retry = config.getoption("--ep-max-retry")
121-
if max_retry is not None:
122-
os.environ["EP_MAX_RETRY"] = str(max_retry)
121+
os.environ["EP_MAX_RETRY"] = str(max_retry)
123122

124-
fail_on_permanent_failure = config.getoption("--ep-fail-on-permanent-failure")
125-
if fail_on_permanent_failure is not None:
126-
os.environ["EP_FAIL_ON_PERMANENT_FAILURE"] = fail_on_permanent_failure
123+
fail_on_max_retry = config.getoption("--ep-fail-on-max-retry")
124+
os.environ["EP_FAIL_ON_MAX_RETRY"] = fail_on_max_retry
127125

128126
# Allow ad-hoc overrides of input params via CLI flags
129127
try:

eval_protocol/pytest/utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,13 @@ async def retry_handler(failed_row: EvaluationRow):
280280

281281
async def initial_processor():
282282
"""Process initial batch and spawn retries for failures"""
283-
base_tasks = rollout_processor(fresh_dataset, config)
283+
# catch any task creation errors and raise them immediately, e.g. a port already in use
284+
try:
285+
base_tasks = rollout_processor(fresh_dataset, config)
286+
except Exception as e:
287+
print(f"❌ Rollout processor failed to initialize: {e}")
288+
raise e
289+
284290
pending = set(base_tasks)
285291

286292
while pending:
@@ -310,7 +316,7 @@ async def initial_processor():
310316

311317
# only permanent failure rows are put on the queue, so we can check for them here
312318
if finished_row.rollout_status and finished_row.rollout_status.status == "error":
313-
if os.getenv("EP_FAIL_ON_PERMANENT_FAILURE", "true") != "false":
319+
if max_retry > 0 and os.getenv("EP_FAIL_ON_MAX_RETRY", "true") != "false":
314320
raise RuntimeError(
315321
f"Rollout {finished_row.execution_metadata.rollout_id} failed after {max_retry} retries. Errors: {finished_row.rollout_status.termination_reason}"
316322
)

tests/pytest/test_livesvgbench.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,6 @@ def test_svg_combined_evaluation(row: EvaluationRow) -> EvaluationRow:
427427
428428
Combines results to catch issues like Google logos that are just colored circles.
429429
"""
430-
logger.info(f"Evaluating row {row.input_metadata.row_id} at {time.time()}")
431430
# Extract dataset info
432431
requirements = row.input_metadata.dataset_info["requirements"]
433432
total_requirements = row.input_metadata.dataset_info["total_requirements"]

0 commit comments

Comments
 (0)