5454 parse_ep_max_rows ,
5555 parse_ep_max_concurrent_rollouts ,
5656 parse_ep_num_runs ,
57+ parse_ep_completion_params ,
5758 rollout_processor_with_retry ,
5859 sanitize_filename ,
5960)
@@ -334,10 +335,14 @@ def evaluation_test( # noqa: C901
334335
335336 active_logger : DatasetLogger = logger if logger else default_logger
336337
337- # Apply override from pytest flags if present
338+ # Optional global overrides via environment for ad-hoc experimentation
339+ # EP_INPUT_PARAMS_JSON can contain a JSON object that will be deep-merged
340+ # into completion_params (e.g., '{"temperature":0,"extra_body":{"reasoning":{"effort":"low"}}}').
338341 num_runs = parse_ep_num_runs (num_runs )
339342 max_concurrent_rollouts = parse_ep_max_concurrent_rollouts (max_concurrent_rollouts )
340343 max_dataset_rows = parse_ep_max_rows (max_dataset_rows )
344+ completion_params = parse_ep_completion_params (completion_params )
345+ original_completion_params = completion_params
341346
342347 def decorator (
343348 test_func : TestFunction ,
@@ -420,9 +425,6 @@ async def execute_with_params(
420425 else :
421426 return test_func (** kwargs )
422427
423- # preserve the original completion_params list for groupwise mode
424- original_completion_params_list = completion_params
425-
426428 # Calculate all possible combinations of parameters
427429 if mode == "groupwise" :
428430 combinations = generate_parameter_combinations (
@@ -544,20 +546,6 @@ def _log_eval_error(status: Status, rows: Optional[List[EvaluationRow]] | None,
544546 "No model provided. Please provide a model in the completion parameters object."
545547 )
546548
547- # Optional global overrides via environment for ad-hoc experimentation
548- # EP_INPUT_PARAMS_JSON can contain a JSON object that will be deep-merged
549- # into input_params (e.g., '{"temperature":0,"extra_body":{"reasoning":{"effort":"low"}}}').
550- try :
551- import json as _json
552-
553- _env_override = os .getenv ("EP_INPUT_PARAMS_JSON" )
554- if _env_override :
555- override_obj = _json .loads (_env_override )
556- if isinstance (override_obj , dict ):
557- completion_params = deep_update_dict (dict (completion_params ), override_obj )
558- except Exception :
559- pass
560-
561549 # Create eval metadata with test function info and current commit hash
562550 eval_metadata = EvalMetadata (
563551 name = test_func .__name__ ,
@@ -661,7 +649,7 @@ async def _execute_eval_with_semaphore(**inner_kwargs):
661649 row_groups = defaultdict (list ) # key: row_id, value: list of rollout_result
662650 tasks : List [asyncio .Task [List [EvaluationRow ]]] = []
663651 # completion_groups = []
664- for idx , cp in enumerate (original_completion_params_list ):
652+ for idx , cp in enumerate (original_completion_params ):
665653 config = RolloutProcessorConfig (
666654 completion_params = cp ,
667655 mcp_config_path = mcp_config_path or "" ,
@@ -744,7 +732,7 @@ async def _collect_result(config, lst):
744732 # rollout_id is used to differentiate the result from different completion_params
745733 if mode == "groupwise" :
746734 results_by_group = [
747- [[] for _ in range (num_runs )] for _ in range (len (original_completion_params_list ))
735+ [[] for _ in range (num_runs )] for _ in range (len (original_completion_params ))
748736 ]
749737 for i_run , result in enumerate (all_results ):
750738 for r in result :
@@ -757,7 +745,7 @@ async def _collect_result(config, lst):
757745 threshold ,
758746 active_logger ,
759747 mode ,
760- original_completion_params_list [rollout_id ],
748+ original_completion_params [rollout_id ],
761749 test_func .__name__ ,
762750 num_runs ,
763751 )
0 commit comments