diff --git a/configs/eval/human-debug.toml b/configs/eval/human-debug.toml new file mode 100644 index 000000000..fa68687fc --- /dev/null +++ b/configs/eval/human-debug.toml @@ -0,0 +1,7 @@ +save_results = true + +[[eval]] +env_id = "primeintellect/wordle" +num_examples = 1 +rollouts_per_example = 1 +human_debug = true diff --git a/docs/evaluation.md b/docs/evaluation.md index 7b3a2e69e..3872b5ec3 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -154,6 +154,7 @@ The `--max-retries` flag enables automatic retry with exponential backoff when r | `--verbose` | `-v` | false | Enable debug logging | | `--tui` | `-u` | false | Use alternate screen mode (TUI) for display | | `--debug` | `-d` | false | Disable Rich display; use normal logging and tqdm progress | +| `--human-debug` | — | false | Use interactive human input for model responses (text-only) | | `--save-results` | `-s` | false | Save results to disk | | `--resume [PATH]` | `-R` | — | Resume from a previous run (auto-detect latest matching incomplete run if PATH omitted) | | `--state-columns` | `-C` | — | Extra state columns to save (comma-separated) | @@ -166,6 +167,21 @@ Results are saved to `./outputs/evals/{env_id}--{model}/{run_id}/`, containing: - `results.jsonl` — rollout outputs, one per line - `metadata.json` — evaluation configuration and aggregate metrics +### Human Debug Mode + +Use `--human-debug` to replace API model calls with terminal-entered responses: + +```bash +prime eval run my-env --human-debug -n 3 -r 1 -s +``` + +In this mode: +- Responses are entered interactively in the CLI and ended with `:wq` on its own line +- Only text responses are supported (tool calls are not supported) +- Exactly one eval config is supported per run +- Execution is forced to sequential + independent scoring (`max_concurrent=1`, `independent_scoring=true`) +- TUI display is disabled automatically to avoid stdin/display conflicts + ### Resuming Evaluations Long-running evaluations can be interrupted and 
resumed using checkpointing. When `--save-results` is enabled, results are saved incrementally after each completed group of rollouts. Use `--resume` to continue from where you left off. Pass a path to resume a specific run, or omit the path to auto-detect the latest incomplete matching run. @@ -290,6 +306,7 @@ Each `[[eval]]` section must contain an `env_id` field. All other fields are opt | `extra_env_kwargs` | table | Arguments passed to environment constructor | | `model` | string | Model to evaluate | | `endpoint_id` | string | Endpoint registry id (requires TOML `endpoints_path`) | +| `human_debug` | boolean | Use interactive human-entered model responses (single eval only) | Example with `env_args`: diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py index 42a62e2b7..d3806ea81 100644 --- a/tests/test_eval_cli.py +++ b/tests/test_eval_cli.py @@ -57,6 +57,7 @@ def _run_cli( "max_retries": 0, "tui": False, "debug": False, + "human_debug": False, "heartbeat_url": None, } base_args.update(overrides) @@ -116,6 +117,35 @@ def test_cli_single_env_id(monkeypatch, run_cli): assert configs[0].env_id == "env1" +def test_cli_human_debug_sets_flag(monkeypatch, run_cli): + captured = run_cli( + monkeypatch, + { + "env_id_or_config": "env1", + "human_debug": True, + }, + ) + + config = captured["configs"][0] + assert config.human_debug is True + + +def test_cli_human_debug_forces_safe_runtime(monkeypatch, run_cli): + captured = run_cli( + monkeypatch, + { + "env_id_or_config": "env1", + "human_debug": True, + "max_concurrent": 8, + "independent_scoring": False, + }, + ) + + config = captured["configs"][0] + assert config.max_concurrent == 1 + assert config.independent_scoring is True + + def test_cli_sampling_args_precedence_over_flags(monkeypatch, run_cli): """sampling_args JSON takes precedence over individual flags.""" captured = run_cli( @@ -469,6 +499,15 @@ def test_load_toml_config_single_eval(): assert result[0]["env_id"] == "env1" +def 
test_load_toml_config_accepts_human_debug(): + with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f: + f.write('[[eval]]\nenv_id = "env1"\nhuman_debug = true\n') + f.flush() + result = load_toml_config(Path(f.name)) + assert len(result) == 1 + assert result[0]["human_debug"] is True + + def test_repo_eval_example_configs_are_valid(): """Bundled example configs should parse with the current eval config schema.""" config_paths = sorted(Path("configs/eval").glob("*.toml")) @@ -569,6 +608,23 @@ def test_cli_multi_env_via_toml_config(monkeypatch, run_cli): assert configs[1].env_id == "env2" +def test_cli_human_debug_rejects_multi_eval_config(monkeypatch, run_cli): + with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f: + f.write( + 'human_debug = true\n[[eval]]\nenv_id = "env1"\n[[eval]]\nenv_id = "env2"\n' + ) + f.flush() + with pytest.raises( + ValueError, match="human_debug mode only supports a single evaluation" + ): + run_cli( + monkeypatch, + { + "env_id_or_config": f.name, + }, + ) + + def test_cli_toml_ignores_cli_args(monkeypatch, run_cli): """TOML config ignores CLI args, uses defaults for unspecified values.""" with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f: diff --git a/tests/test_human_cli_client.py b/tests/test_human_cli_client.py new file mode 100644 index 000000000..e25ee8b63 --- /dev/null +++ b/tests/test_human_cli_client.py @@ -0,0 +1,95 @@ +import builtins + +import pytest + +from verifiers.clients.human_cli_client import HumanCLIClient +from verifiers.errors import ModelError +from verifiers.types import SystemMessage, Tool, UserMessage + + +@pytest.mark.asyncio +async def test_human_cli_client_returns_multiline_response(monkeypatch): + responses = iter(["first line", "second line", ":wq"]) + monkeypatch.setattr(builtins, "input", lambda: next(responses)) + + client = HumanCLIClient() + response = await client.get_response( + prompt=[UserMessage(content="hello")], + 
model="test-model", + sampling_args={}, + ) + + assert response.model == "test-model" + assert response.message.content == "first line\nsecond line" + assert response.message.finish_reason == "stop" + assert response.message.tool_calls is None + + +@pytest.mark.asyncio +async def test_human_cli_client_reprompts_on_empty_response(monkeypatch): + responses = iter(["", ":wq", "final answer", ":wq"]) + monkeypatch.setattr(builtins, "input", lambda: next(responses)) + + client = HumanCLIClient() + response = await client.get_response( + prompt=[UserMessage(content="hello")], + model="test-model", + sampling_args={}, + ) + + assert response.message.content == "final answer" + + +@pytest.mark.asyncio +async def test_human_cli_client_rejects_tool_calls(monkeypatch): + responses = iter(["answer", ":wq"]) + monkeypatch.setattr(builtins, "input", lambda: next(responses)) + + client = HumanCLIClient() + with pytest.raises(ModelError, match="text-only"): + await client.get_response( + prompt=[UserMessage(content="hello")], + model="test-model", + sampling_args={}, + tools=[ + Tool( + name="my_tool", + description="test tool", + parameters={"type": "object", "properties": {}}, + ) + ], + ) + + +@pytest.mark.asyncio +async def test_human_cli_client_propagates_keyboard_interrupt(monkeypatch): + def raise_interrupt(): + raise KeyboardInterrupt + + monkeypatch.setattr(builtins, "input", raise_interrupt) + + client = HumanCLIClient() + with pytest.raises(KeyboardInterrupt): + await client.get_response( + prompt=[UserMessage(content="hello")], + model="test-model", + sampling_args={}, + ) + + +@pytest.mark.asyncio +async def test_human_cli_client_renders_prompt_without_crashing(monkeypatch): + responses = iter(["done", ":wq"]) + monkeypatch.setattr(builtins, "input", lambda: next(responses)) + + client = HumanCLIClient() + response = await client.get_response( + prompt=[ + SystemMessage(content="You are helpful"), + UserMessage(content="Solve this"), + ], + model="test-model", + 
class HumanCLIClient(Client[None, Messages, Response, Tool]):
    """Client that captures assistant responses from a human in the terminal.

    Instead of calling a model API, ``get_response`` shows the prompt on the
    console and reads lines from standard input until the sentinel line is
    entered. Only plain-text responses are produced; requests that carry
    tools are rejected.

    Args:
        sentinel: Line that terminates a response. Surrounding whitespace is
            ignored both on the sentinel and on the typed line.
    """

    def __init__(self, sentinel: str = ":wq") -> None:
        self.sentinel = sentinel
        self._console = Console()
        # There is no underlying native client, hence ``None``.
        super().__init__(None)

    def setup_client(self, config: ClientConfig) -> None:
        """No native client is created for human input."""
        return None

    async def to_native_tool(self, tool: Tool) -> Tool:
        """Return ``tool`` unchanged; there is no native tool format."""
        return tool

    async def to_native_prompt(self, messages: Messages) -> tuple[Messages, dict]:
        """Return ``messages`` unchanged together with empty extra kwargs."""
        return messages, {}

    async def get_native_response(
        self,
        prompt: Messages,
        model: str,
        sampling_args: SamplingArgs,
        tools: list[Tool] | None = None,
        **kwargs,
    ) -> Response:
        """Unsupported entry point; ``get_response`` is overridden directly.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "HumanCLIClient.get_native_response is not used. Call get_response()."
        )

    async def raise_from_native_response(self, response: Response) -> None:
        """Nothing to validate: responses are built locally."""
        return None

    async def from_native_response(self, response: Response) -> Response:
        """Return ``response`` unchanged."""
        return response

    async def close(self) -> None:
        """Nothing to release."""
        return None

    def _read_human_response(self, prompt: Messages) -> str:
        """Display ``prompt`` and read a multi-line response from stdin.

        Lines are collected until one equals the sentinel (ignoring
        surrounding whitespace). A response consisting only of whitespace is
        rejected and the user is asked again. If stdin reaches EOF after some
        non-blank text was typed, that text is returned as the response
        instead of being thrown away.

        Returns:
            The entered lines joined with newlines (sentinel excluded).

        Raises:
            EmptyModelResponseError: If stdin reaches EOF before any
                non-blank text was entered.
        """
        self._console.rule("Human Debug")
        self._console.print(format_messages(prompt))
        # The hint embeds the caller-supplied sentinel; print it without
        # markup/emoji parsing so a sentinel such as "[end]" stays visible.
        self._console.print(
            f"\nEnter assistant response. End input with `{self.sentinel}` on its own line.",
            markup=False,
            emoji=False,
        )

        # Typed lines are stripped before comparison, so strip the sentinel
        # too; otherwise a padded sentinel could never match.
        terminator = self.sentinel.strip()

        while True:
            lines: list[str] = []
            while True:
                try:
                    line = input()
                except EOFError as e:
                    if any(entry.strip() for entry in lines):
                        # Treat EOF as end-of-response so typed text is kept.
                        break
                    raise EmptyModelResponseError(
                        "Reached EOF while waiting for human input"
                    ) from e

                if line.strip() == terminator:
                    break
                lines.append(line)

            response_text = "\n".join(lines)
            if response_text.strip():
                return response_text

            self._console.print("Empty response. Please enter a non-empty response.")

    async def get_response(
        self,
        prompt: Messages,
        model: str,
        sampling_args: SamplingArgs,
        tools: list[Tool] | None = None,
        **kwargs,
    ) -> Response:
        """Build a ``Response`` from text typed by a human.

        Args:
            prompt: Messages shown to the human before input is read.
            model: Model name echoed back in the returned response.
            sampling_args: Accepted for interface compatibility; unused.
            tools: Must be empty or ``None``.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A response with a generated ``human-…`` id, the current
            timestamp, ``finish_reason="stop"`` and no tool calls or usage.

        Raises:
            ModelError: If ``tools`` is non-empty.
            EmptyModelResponseError: If stdin ends before any text is entered.
        """
        if tools:
            raise ModelError(
                "Human debug mode is text-only and does not support tool calls."
            )

        # NOTE: this read is synchronous and blocks the event loop while the
        # human is typing.
        content = self._read_human_response(prompt)
        return Response(
            id=f"human-{uuid.uuid4().hex}",
            created=int(time.time()),
            model=model,
            usage=None,
            message=ResponseMessage(
                content=content,
                reasoning_content=None,
                finish_reason="stop",
                is_truncated=False,
                tokens=None,
                tool_calls=None,
            ),
        )
+ ) + config = human_debug_configs[0] + if config.max_concurrent != 1: + logger.info( + "human_debug mode requires sequential execution; forcing max_concurrent=1" + ) + config.max_concurrent = 1 + if not config.independent_scoring: + logger.info( + "human_debug mode requires independent scoring; enabling independent_scoring=True" + ) + config.independent_scoring = True + for config in eval_configs: logger.debug(f"Evaluation config: {config.model_dump_json(indent=2)}") eval_run_config = EvalRunConfig( evals=eval_configs, heartbeat_url=args.heartbeat_url ) - if args.debug: - asyncio.run(run_evaluations(eval_run_config)) + if args.debug or human_debug_configs: + try: + asyncio.run(run_evaluations(eval_run_config)) + except KeyboardInterrupt: + logger.info("Interrupted by user.") + raise SystemExit(130) else: asyncio.run(run_evaluations_tui(eval_run_config, tui_mode=args.tui)) diff --git a/verifiers/types.py b/verifiers/types.py index 793bae542..84008ea6a 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -486,6 +486,7 @@ class EvalConfig(BaseModel): rollouts_per_example: int max_concurrent: int independent_scoring: bool = False + human_debug: bool = False extra_env_kwargs: dict = {} max_retries: int = 0 # logging diff --git a/verifiers/utils/eval_utils.py b/verifiers/utils/eval_utils.py index e30f60ff6..2911c7716 100644 --- a/verifiers/utils/eval_utils.py +++ b/verifiers/utils/eval_utils.py @@ -17,6 +17,7 @@ from datasets.utils import logging as ds_logging import verifiers as vf +from verifiers.clients.human_cli_client import HumanCLIClient from verifiers.types import ( ClientType, Endpoint, @@ -326,6 +327,7 @@ def load_toml_config(path: Path) -> list[dict]: "rollouts_per_example", "max_concurrent", "independent_scoring", + "human_debug", "max_retries", # logging "verbose", @@ -585,22 +587,28 @@ async def run_evaluation( results_path = config.resume_path or get_eval_results_path(config) try: - if config.debug: - await vf_env.start_server( - 
extra_env_kwargs=config.extra_env_kwargs, - log_level=get_log_level(config.verbose), - ) + if config.human_debug: + logger.info("Running evaluation in human debug mode") + client = HumanCLIClient() else: - log_file = results_path / "eval.log" - log_file.parent.mkdir(parents=True, exist_ok=True) - await vf_env.start_server( - extra_env_kwargs=config.extra_env_kwargs, - log_level="CRITICAL", # disable console logging - log_file=str(log_file), - log_file_level=get_log_level(config.verbose), - ) - if on_log_file is not None: - on_log_file(log_file) + if config.debug: + await vf_env.start_server( + extra_env_kwargs=config.extra_env_kwargs, + log_level=get_log_level(config.verbose), + ) + else: + log_file = results_path / "eval.log" + log_file.parent.mkdir(parents=True, exist_ok=True) + await vf_env.start_server( + extra_env_kwargs=config.extra_env_kwargs, + log_level="CRITICAL", # disable console logging + log_file=str(log_file), + log_file_level=get_log_level(config.verbose), + ) + if on_log_file is not None: + on_log_file(log_file) + + client = config.client_config logger.debug(f"Starting evaluation with model: {config.model}") logger.debug( @@ -624,7 +632,7 @@ async def run_evaluation( ) outputs = await vf_env.evaluate( - client=config.client_config, + client=client, model=config.model, sampling_args=config.sampling_args, num_examples=config.num_examples,