diff --git a/verifiers/scripts/eval.py b/verifiers/scripts/eval.py
index f00580f24..00e07f29d 100644
--- a/verifiers/scripts/eval.py
+++ b/verifiers/scripts/eval.py
@@ -338,6 +338,12 @@ def main():
         default=0,
         help="Max retries for transient infrastructure errors (default: 0)",
     )
+    parser.add_argument(
+        "--disable-env-server",
+        default=False,
+        action="store_true",
+        help="Do not start env servers when evaluating environments",
+    )
     parser.add_argument(
         "--heartbeat-url",
         type=str,
@@ -620,6 +626,7 @@ def build_eval_config(raw: dict) -> EvalConfig:
         rollouts_per_example=rollouts_per_example,
         max_concurrent=raw.get("max_concurrent", DEFAULT_MAX_CONCURRENT),
         max_retries=raw.get("max_retries", 0),
+        disable_env_server=raw.get("disable_env_server", False),
         verbose=raw.get("verbose", False),
         debug=raw.get("debug", False),
         state_columns=raw.get("state_columns", []),
diff --git a/verifiers/types.py b/verifiers/types.py
index 793bae542..30416e0a8 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -488,6 +488,7 @@ class EvalConfig(BaseModel):
     independent_scoring: bool = False
     extra_env_kwargs: dict = {}
     max_retries: int = 0
+    disable_env_server: bool = False
     # logging
     verbose: bool = False
     debug: bool = False
diff --git a/verifiers/utils/eval_utils.py b/verifiers/utils/eval_utils.py
index e30f60ff6..d5ad273d4 100644
--- a/verifiers/utils/eval_utils.py
+++ b/verifiers/utils/eval_utils.py
@@ -327,6 +327,7 @@ def load_toml_config(path: Path) -> list[dict]:
         "max_concurrent",
         "independent_scoring",
         "max_retries",
+        "disable_env_server",
         # logging
         "verbose",
         "debug",
@@ -585,22 +586,23 @@ async def run_evaluation(
     results_path = config.resume_path or get_eval_results_path(config)

     try:
-        if config.debug:
-            await vf_env.start_server(
-                extra_env_kwargs=config.extra_env_kwargs,
-                log_level=get_log_level(config.verbose),
-            )
-        else:
-            log_file = results_path / "eval.log"
-            log_file.parent.mkdir(parents=True, exist_ok=True)
-            await vf_env.start_server(
-                extra_env_kwargs=config.extra_env_kwargs,
-                log_level="CRITICAL",  # disable console logging
-                log_file=str(log_file),
-                log_file_level=get_log_level(config.verbose),
-            )
-            if on_log_file is not None:
-                on_log_file(log_file)
+        if not config.disable_env_server:
+            if config.debug:
+                await vf_env.start_server(
+                    extra_env_kwargs=config.extra_env_kwargs,
+                    log_level=get_log_level(config.verbose),
+                )
+            else:
+                log_file = results_path / "eval.log"
+                log_file.parent.mkdir(parents=True, exist_ok=True)
+                await vf_env.start_server(
+                    extra_env_kwargs=config.extra_env_kwargs,
+                    log_level="CRITICAL",  # disable console logging
+                    log_file=str(log_file),
+                    log_file_level=get_log_level(config.verbose),
+                )
+                if on_log_file is not None:
+                    on_log_file(log_file)

     logger.debug(f"Starting evaluation with model: {config.model}")
     logger.debug(
@@ -642,7 +644,8 @@ async def run_evaluation(
             on_log=on_log,
         )
     finally:
-        await vf_env.stop_server()
+        if not config.disable_env_server:
+            await vf_env.stop_server()

     return outputs
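Usage note (illustrative, not part of the patch): the flag is passed on the command line as `--disable-env-server`, or set as `disable_env_server = true` in a TOML eval config, since the key is added to the allowlist consumed by `load_toml_config`. The sketch below mirrors the patched control flow in `run_evaluation` under stated assumptions; everything except `disable_env_server`, `start_server`, and `stop_server` (placeholder `do_rollouts`, the simplified call signatures) is hypothetical and not the real API.

```python
async def do_rollouts(vf_env, config):
    """Placeholder for the real rollout loop inside run_evaluation()."""
    return []


async def run_with_optional_server(vf_env, config):
    # Mirrors the patched logic: the env server is started and stopped here
    # only when disable_env_server is False; otherwise the caller is assumed
    # to manage (or not need) a running env server.
    try:
        if not config.disable_env_server:
            await vf_env.start_server(extra_env_kwargs=config.extra_env_kwargs)
        outputs = await do_rollouts(vf_env, config)
    finally:
        if not config.disable_env_server:
            await vf_env.stop_server()
    return outputs
```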