File tree Expand file tree Collapse file tree 3 files changed +14
-11
lines changed
Expand file tree Collapse file tree 3 files changed +14
-11
lines changed Original file line number Diff line number Diff line change @@ -63,13 +63,13 @@ def pytest_addoption(parser) -> None:
6363 "--ep-max-retry" ,
6464 action = "store" ,
6565 type = int ,
66- default = None ,
66+ default = 0 ,
6767 help = ("Failed rollouts (with rollout_status.status == 'error') will be retried up to this many times." ),
6868 )
6969 group .addoption (
70- "--ep-fail-on-permanent-failure " ,
70+ "--ep-fail-on-max-retry " ,
7171 action = "store" ,
72- default = None ,
72+ default = "true" ,
7373 choices = ["true" , "false" ],
7474 help = (
7575 "Whether to fail the entire rollout when permanent failures occur after max retries. "
@@ -118,12 +118,10 @@ def pytest_configure(config) -> None:
118118 os .environ ["EP_SUMMARY_JSON" ] = summary_json_path
119119
120120 max_retry = config .getoption ("--ep-max-retry" )
121- if max_retry is not None :
122- os .environ ["EP_MAX_RETRY" ] = str (max_retry )
121+ os .environ ["EP_MAX_RETRY" ] = str (max_retry )
123122
124- fail_on_permanent_failure = config .getoption ("--ep-fail-on-permanent-failure" )
125- if fail_on_permanent_failure is not None :
126- os .environ ["EP_FAIL_ON_PERMANENT_FAILURE" ] = fail_on_permanent_failure
123+ fail_on_max_retry = config .getoption ("--ep-fail-on-max-retry" )
124+ os .environ ["EP_FAIL_ON_MAX_RETRY" ] = fail_on_max_retry
127125
128126 # Allow ad-hoc overrides of input params via CLI flags
129127 try :
Original file line number Diff line number Diff line change @@ -280,7 +280,13 @@ async def retry_handler(failed_row: EvaluationRow):
280280
281281 async def initial_processor ():
282282 """Process initial batch and spawn retries for failures"""
283- base_tasks = rollout_processor (fresh_dataset , config )
283+ # catch any task creation errors and raise them immediately, i.e. port already in use
284+ try :
285+ base_tasks = rollout_processor (fresh_dataset , config )
286+ except Exception as e :
287+ print (f"❌ Rollout processor failed to initialize: { e } " )
288+ raise e
289+
284290 pending = set (base_tasks )
285291
286292 while pending :
@@ -310,7 +316,7 @@ async def initial_processor():
310316
311317 # only permanent failure rows are put on the queue, so we can check for them here
312318 if finished_row .rollout_status and finished_row .rollout_status .status == "error" :
313- if os .getenv ("EP_FAIL_ON_PERMANENT_FAILURE " , "true" ) != "false" :
319+ if max_retry > 0 and os .getenv ("EP_FAIL_ON_MAX_RETRY " , "true" ) != "false" :
314320 raise RuntimeError (
315321 f"Rollout { finished_row .execution_metadata .rollout_id } failed after { max_retry } retries. Errors: { finished_row .rollout_status .termination_reason } "
316322 )
Original file line number Diff line number Diff line change @@ -427,7 +427,6 @@ def test_svg_combined_evaluation(row: EvaluationRow) -> EvaluationRow:
427427
428428 Combines results to catch issues like Google logos that are just colored circles.
429429 """
430- logger .info (f"Evaluating row { row .input_metadata .row_id } at { time .time ()} " )
431430 # Extract dataset info
432431 requirements = row .input_metadata .dataset_info ["requirements" ]
433432 total_requirements = row .input_metadata .dataset_info ["total_requirements" ]
You can’t perform that action at this time.
0 commit comments