7 changes: 7 additions & 0 deletions verifiers/scripts/eval.py
@@ -338,6 +338,12 @@ def main():
         default=0,
         help="Max retries for transient infrastructure errors (default: 0)",
     )
+    parser.add_argument(
+        "--disable-env-server",
+        default=False,
+        action="store_true",
+        help="Do not start env servers when evaluating environments",
+    )
     parser.add_argument(
         "--heartbeat-url",
         type=str,
@@ -620,6 +626,7 @@ def build_eval_config(raw: dict) -> EvalConfig:
         rollouts_per_example=rollouts_per_example,
         max_concurrent=raw.get("max_concurrent", DEFAULT_MAX_CONCURRENT),
         max_retries=raw.get("max_retries", 0),
+        disable_env_server=raw.get("disable_env_server", False),
         verbose=raw.get("verbose", False),
         debug=raw.get("debug", False),
         state_columns=raw.get("state_columns", []),
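With this change, env server startup can be skipped from the command line. A hedged usage sketch (only --disable-env-server comes from this diff; the script's other arguments are deliberately elided rather than guessed):

    # the flag is store_true with a False default, so its presence alone enables it
    python verifiers/scripts/eval.py [other eval arguments] --disable-env-server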
1 change: 1 addition & 0 deletions verifiers/types.py
@@ -488,6 +488,7 @@ class EvalConfig(BaseModel):
     independent_scoring: bool = False
     extra_env_kwargs: dict = {}
     max_retries: int = 0
+    disable_env_server: bool = False
     # logging
     verbose: bool = False
     debug: bool = False
37 changes: 20 additions & 17 deletions verifiers/utils/eval_utils.py
@@ -327,6 +327,7 @@ def load_toml_config(path: Path) -> list[dict]:
         "max_concurrent",
         "independent_scoring",
         "max_retries",
+        "disable_env_server",
         # logging
         "verbose",
         "debug",
@@ -585,22 +586,23 @@ async def run_evaluation(
     results_path = config.resume_path or get_eval_results_path(config)
 
     try:
-        if config.debug:
-            await vf_env.start_server(
-                extra_env_kwargs=config.extra_env_kwargs,
-                log_level=get_log_level(config.verbose),
-            )
-        else:
-            log_file = results_path / "eval.log"
-            log_file.parent.mkdir(parents=True, exist_ok=True)
-            await vf_env.start_server(
-                extra_env_kwargs=config.extra_env_kwargs,
-                log_level="CRITICAL", # disable console logging
-                log_file=str(log_file),
-                log_file_level=get_log_level(config.verbose),
-            )
-            if on_log_file is not None:
-                on_log_file(log_file)
+        if not config.disable_env_server:
+            if config.debug:
+                await vf_env.start_server(
+                    extra_env_kwargs=config.extra_env_kwargs,
+                    log_level=get_log_level(config.verbose),
+                )
+            else:
+                log_file = results_path / "eval.log"
+                log_file.parent.mkdir(parents=True, exist_ok=True)
+                await vf_env.start_server(
+                    extra_env_kwargs=config.extra_env_kwargs,
+                    log_level="CRITICAL", # disable console logging
+                    log_file=str(log_file),
+                    log_file_level=get_log_level(config.verbose),
+                )
+                if on_log_file is not None:
+                    on_log_file(log_file)
 
         logger.debug(f"Starting evaluation with model: {config.model}")
         logger.debug(
@@ -642,7 +644,8 @@
             on_log=on_log,
         )
     finally:
-        await vf_env.stop_server()
+        if not config.disable_env_server:
+            await vf_env.stop_server()
 
     return outputs
 
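Because "disable_env_server" is now in the allowed keys of load_toml_config, the same switch can also be set per-entry in a TOML eval config. A hedged sketch (only the key names are grounded in this diff; the values and any surrounding table structure are illustrative):

    # partial eval config entry; other required keys omitted
    max_retries = 2
    disable_env_server = true
    verbose = false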