diff --git a/configs/eval/human-debug.toml b/configs/eval/human-debug.toml
new file mode 100644
index 000000000..fa68687fc
--- /dev/null
+++ b/configs/eval/human-debug.toml
@@ -0,0 +1,7 @@
+save_results = true
+
+[[eval]]
+env_id = "primeintellect/wordle"
+num_examples = 1
+rollouts_per_example = 1
+human_debug = true
diff --git a/docs/evaluation.md b/docs/evaluation.md
index 7b3a2e69e..3872b5ec3 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -154,6 +154,7 @@ The `--max-retries` flag enables automatic retry with exponential backoff when r
 | `--verbose` | `-v` | false | Enable debug logging |
 | `--tui` | `-u` | false | Use alternate screen mode (TUI) for display |
 | `--debug` | `-d` | false | Disable Rich display; use normal logging and tqdm progress |
+| `--human-debug` | — | false | Use interactive human input for model responses (text-only) |
 | `--save-results` | `-s` | false | Save results to disk |
 | `--resume [PATH]` | `-R` | — | Resume from a previous run (auto-detect latest matching incomplete run if PATH omitted) |
 | `--state-columns` | `-C` | — | Extra state columns to save (comma-separated) |
@@ -166,6 +167,21 @@ Results are saved to `./outputs/evals/{env_id}--{model}/{run_id}/`, containing:
 - `results.jsonl` — rollout outputs, one per line
 - `metadata.json` — evaluation configuration and aggregate metrics
 
+### Human Debug Mode
+
+Use `--human-debug` to replace API model calls with terminal-entered responses:
+
+```bash
+prime eval run my-env --human-debug -n 3 -r 1 -s
+```
+
+In this mode:
+- Responses are entered interactively in the CLI and ended with `:wq` on its own line
+- Only text responses are supported (tool calls are not supported)
+- Exactly one eval config is supported per run
+- Execution is forced to be sequential with independent scoring (`max_concurrent=1`, `independent_scoring=true`)
+- The Rich display (including `--tui`) is disabled automatically, as with `--debug`, to avoid stdin/display conflicts
+
 ### Resuming Evaluations
 
 Long-running evaluations can be interrupted and resumed using checkpointing. When `--save-results` is enabled, results are saved incrementally after each completed group of rollouts. Use `--resume` to continue from where you left off. Pass a path to resume a specific run, or omit the path to auto-detect the latest incomplete matching run.
@@ -290,6 +306,7 @@ Each `[[eval]]` section must contain an `env_id` field. All other fields are opt
 | `extra_env_kwargs` | table | Arguments passed to environment constructor |
 | `model` | string | Model to evaluate |
 | `endpoint_id` | string | Endpoint registry id (requires TOML `endpoints_path`) |
+| `human_debug` | boolean | Use interactive human-entered model responses (single eval only) |
 
 Example with `env_args`:
 
diff --git a/tests/test_eval_cli.py b/tests/test_eval_cli.py
index 42a62e2b7..d3806ea81 100644
--- a/tests/test_eval_cli.py
+++ b/tests/test_eval_cli.py
@@ -57,6 +57,7 @@ def _run_cli(
             "max_retries": 0,
             "tui": False,
             "debug": False,
+            "human_debug": False,
             "heartbeat_url": None,
         }
         base_args.update(overrides)
@@ -116,6 +117,35 @@ def test_cli_single_env_id(monkeypatch, run_cli):
     assert configs[0].env_id == "env1"
 
 
+def test_cli_human_debug_sets_flag(monkeypatch, run_cli):
+    """The human_debug CLI argument is carried onto the resulting eval config."""
+    overrides = {
+        "env_id_or_config": "env1",
+        "human_debug": True,
+    }
+
+    result = run_cli(monkeypatch, overrides)
+
+    first_config = result["configs"][0]
+    assert first_config.human_debug is True
+
+
+def test_cli_human_debug_forces_safe_runtime(monkeypatch, run_cli):
+    """human_debug overrides conflicting concurrency and scoring arguments."""
+    overrides = {
+        "env_id_or_config": "env1",
+        "human_debug": True,
+        "max_concurrent": 8,
+        "independent_scoring": False,
+    }
+
+    result = run_cli(monkeypatch, overrides)
+    first_config = result["configs"][0]
+
+    assert first_config.max_concurrent == 1
+    assert first_config.independent_scoring is True
+
+
 def test_cli_sampling_args_precedence_over_flags(monkeypatch, run_cli):
     """sampling_args JSON takes precedence over individual flags."""
     captured = run_cli(
@@ -469,6 +499,15 @@ def test_load_toml_config_single_eval():
         assert result[0]["env_id"] == "env1"
 
 
+def test_load_toml_config_accepts_human_debug():
+    """A per-eval `human_debug` key is preserved by load_toml_config."""
+    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as tmp:
+        tmp.write('[[eval]]\nenv_id = "env1"\nhuman_debug = true\n')
+        tmp.flush()
+        loaded = load_toml_config(Path(tmp.name))
+    assert len(loaded) == 1 and loaded[0]["human_debug"] is True
+
+
 def test_repo_eval_example_configs_are_valid():
     """Bundled example configs should parse with the current eval config schema."""
     config_paths = sorted(Path("configs/eval").glob("*.toml"))
@@ -569,6 +608,23 @@ def test_cli_multi_env_via_toml_config(monkeypatch, run_cli):
     assert configs[1].env_id == "env2"
 
 
+def test_cli_human_debug_rejects_multi_eval_config(monkeypatch, run_cli):
+    """human_debug combined with two [[eval]] sections raises ValueError."""
+    toml_text = (
+        'human_debug = true\n[[eval]]\nenv_id = "env1"\n[[eval]]\nenv_id = "env2"\n'
+    )
+    expected_error = "human_debug mode only supports a single evaluation"
+
+    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as tmp:
+        tmp.write(toml_text)
+        # Flush so the content is on disk before the path is handed to run_cli.
+        tmp.flush()
+        overrides = {"env_id_or_config": tmp.name}
+
+        with pytest.raises(ValueError, match=expected_error):
+            run_cli(monkeypatch, overrides)
+
+
 def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
     """TOML config ignores CLI args, uses defaults for unspecified values."""
     with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
diff --git a/tests/test_human_cli_client.py b/tests/test_human_cli_client.py
new file mode 100644
index 000000000..e25ee8b63
--- /dev/null
+++ b/tests/test_human_cli_client.py
@@ -0,0 +1,95 @@
+import builtins
+
+import pytest
+
+from verifiers.clients.human_cli_client import HumanCLIClient
+from verifiers.errors import ModelError
+from verifiers.types import SystemMessage, Tool, UserMessage
+
+
+@pytest.mark.asyncio
+async def test_human_cli_client_returns_multiline_response(monkeypatch):
+    """Lines typed before the sentinel are joined with newlines."""
+    typed_lines = iter(["first line", "second line", ":wq"])
+    monkeypatch.setattr(builtins, "input", typed_lines.__next__)
+
+    response = await HumanCLIClient().get_response(
+        prompt=[UserMessage(content="hello")],
+        model="test-model",
+        sampling_args={},
+    )
+
+    message = response.message
+    assert response.model == "test-model"
+    assert message.content == "first line\nsecond line"
+    assert message.finish_reason == "stop" and message.tool_calls is None
+
+
+@pytest.mark.asyncio
+async def test_human_cli_client_reprompts_on_empty_response(monkeypatch):
+    """A blank submission is rejected and the next attempt is returned."""
+    typed_lines = iter(["", ":wq", "final answer", ":wq"])
+    monkeypatch.setattr(builtins, "input", typed_lines.__next__)
+
+    response = await HumanCLIClient().get_response(
+        prompt=[UserMessage(content="hello")],
+        model="test-model",
+        sampling_args={},
+    )
+
+    assert response.message.content == "final answer"
+
+
+@pytest.mark.asyncio
+async def test_human_cli_client_rejects_tool_calls(monkeypatch):
+    """Passing any tool makes get_response raise a text-only ModelError."""
+    typed_lines = iter(["answer", ":wq"])
+    monkeypatch.setattr(builtins, "input", typed_lines.__next__)
+    sample_tool = Tool(
+        name="my_tool",
+        description="test tool",
+        parameters={"type": "object", "properties": {}},
+    )
+    client = HumanCLIClient()
+
+    with pytest.raises(ModelError, match="text-only"):
+        await client.get_response(
+            prompt=[UserMessage(content="hello")],
+            model="test-model",
+            sampling_args={},
+            tools=[sample_tool],
+        )
+
+
+@pytest.mark.asyncio
+async def test_human_cli_client_propagates_keyboard_interrupt(monkeypatch):
+    # input() raising KeyboardInterrupt must propagate out of get_response.
+    def interrupting_input():
+        raise KeyboardInterrupt
+
+    monkeypatch.setattr(builtins, "input", interrupting_input)
+    call_kwargs = {
+        "prompt": [UserMessage(content="hello")],
+        "model": "test-model",
+        "sampling_args": {},
+    }
+    with pytest.raises(KeyboardInterrupt):
+        await HumanCLIClient().get_response(**call_kwargs)
+
+
+@pytest.mark.asyncio
+async def test_human_cli_client_renders_prompt_without_crashing(monkeypatch):
+    """A system + user prompt is displayed and the typed reply is returned."""
+    typed_lines = iter(["done", ":wq"])
+    monkeypatch.setattr(builtins, "input", typed_lines.__next__)
+    conversation = [
+        SystemMessage(content="You are helpful"),
+        UserMessage(content="Solve this"),
+    ]
+
+    response = await HumanCLIClient().get_response(
+        prompt=conversation,
+        model="test-model",
+        sampling_args={},
+    )
+    assert response.message.content == "done"
diff --git a/verifiers/clients/__init__.py b/verifiers/clients/__init__.py
index 3d8ee627a..51a95e8de 100644
--- a/verifiers/clients/__init__.py
+++ b/verifiers/clients/__init__.py
@@ -2,6 +2,7 @@
 
 from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
 from verifiers.clients.client import Client
+from verifiers.clients.human_cli_client import HumanCLIClient
 from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
 from verifiers.clients.openai_chat_completions_token_client import (
     OpenAIChatCompletionsTokenClient,
@@ -35,5 +36,6 @@ def resolve_client(client_or_config: Client | ClientConfig) -> Client:
     "OpenAICompletionsClient",
     "OpenAIChatCompletionsClient",
     "OpenAIChatCompletionsTokenClient",
+    "HumanCLIClient",
     "Client",
 ]
diff --git a/verifiers/clients/human_cli_client.py b/verifiers/clients/human_cli_client.py
new file mode 100644
index 000000000..9d84c5cbe
--- /dev/null
+++ b/verifiers/clients/human_cli_client.py
@@ -0,0 +1,111 @@
+import time
+import uuid
+
+from rich.console import Console
+
+from verifiers.clients.client import Client
+from verifiers.errors import EmptyModelResponseError, ModelError
+from verifiers.types import (
+    ClientConfig,
+    Messages,
+    Response,
+    ResponseMessage,
+    SamplingArgs,
+    Tool,
+)
+from verifiers.utils.logging_utils import format_messages
+
+
+class HumanCLIClient(Client[None, Messages, Response, Tool]):
+    """Client whose responses are typed by a person and ended by a sentinel line."""
+
+    def __init__(self, sentinel: str = ":wq") -> None:
+        self._console = Console()
+        self.sentinel = sentinel
+        super().__init__(None)
+
+    def setup_client(self, config: ClientConfig) -> None:
+        """Return None; no underlying client object is created."""
+
+    async def to_native_tool(self, tool: Tool) -> Tool:
+        return tool  # returned unchanged
+
+    async def to_native_prompt(self, messages: Messages) -> tuple[Messages, dict]:
+        return (messages, {})  # same messages, always an empty dict
+
+    async def get_native_response(
+        self,
+        prompt: Messages,
+        model: str,
+        sampling_args: SamplingArgs,
+        tools: list[Tool] | None = None,
+        **kwargs,
+    ) -> Response:
+        """Always raise NotImplementedError; callers are told to use get_response()."""
+        raise NotImplementedError(
+            "HumanCLIClient.get_native_response is not used. Call get_response()."
+        )
+
+    async def raise_from_native_response(self, response: Response) -> None:
+        """Return None without inspecting ``response``."""
+
+    async def from_native_response(self, response: Response) -> Response:
+        return response  # returned unchanged
+
+    async def close(self) -> None:
+        """Return None without doing anything."""
+
+    def _read_human_response(self, prompt: Messages) -> str:
+        """Print ``prompt`` and read lines until a non-blank reply is entered."""
+        self._console.rule("Human Debug")
+        self._console.print(format_messages(prompt))
+        self._console.print(
+            f"\nEnter assistant response. End input with `{self.sentinel}` on its own line."
+        )
+        # EOF before the sentinel raises EmptyModelResponseError, even mid-reply.
+        typed: list[str] = []
+        while True:
+            try:
+                entry = input()
+            except EOFError as eof:
+                raise EmptyModelResponseError(
+                    "Reached EOF while waiting for human input"
+                ) from eof
+            if entry.strip() != self.sentinel:
+                typed.append(entry)
+                continue
+            reply = "\n".join(typed)
+            if reply.strip():
+                return reply  # leading/trailing whitespace is kept as typed
+            self._console.print("Empty response. Please enter a non-empty response.")
+            typed.clear()
+
+    async def get_response(
+        self,
+        prompt: Messages,
+        model: str,
+        sampling_args: SamplingArgs,
+        tools: list[Tool] | None = None,
+        **kwargs,
+    ) -> Response:
+        """Return the typed reply as a text-only Response; reject any ``tools``."""
+        if tools:
+            raise ModelError(
+                "Human debug mode is text-only and does not support tool calls."
+            )
+
+        human_message = ResponseMessage(
+            content=self._read_human_response(prompt),
+            reasoning_content=None,
+            finish_reason="stop",
+            is_truncated=False,
+            tokens=None,
+            tool_calls=None,
+        )
+        return Response(
+            id=f"human-{uuid.uuid4().hex}",
+            created=int(time.time()),
+            model=model,
+            usage=None,
+            message=human_message,
+        )
diff --git a/verifiers/scripts/eval.py b/verifiers/scripts/eval.py
index f00580f24..d7f5d8a72 100644
--- a/verifiers/scripts/eval.py
+++ b/verifiers/scripts/eval.py
@@ -332,6 +332,12 @@ def main():
         action="store_true",
         help="Disable Rich display; use normal logging and tqdm progress instead",
     )
+    parser.add_argument(
+        "--human-debug",
+        default=False,
+        action="store_true",
+        help="Use interactive human input for model responses (eval-only debug mode)",
+    )
     parser.add_argument(
         "--max-retries",
         type=int,
@@ -622,6 +628,7 @@ def build_eval_config(raw: dict) -> EvalConfig:
             max_retries=raw.get("max_retries", 0),
             verbose=raw.get("verbose", False),
             debug=raw.get("debug", False),
+            human_debug=raw.get("human_debug", False),
             state_columns=raw.get("state_columns", []),
             save_results=raw.get("save_results", False),
             resume_path=resume_path,
@@ -644,14 +651,36 @@ def build_eval_config(raw: dict) -> EvalConfig:
         raise SystemExit(1)
 
     eval_configs = [build_eval_config(raw) for raw in raw_eval_configs]
+    human_debug_configs = [config for config in eval_configs if config.human_debug]
+    if human_debug_configs:
+        if len(eval_configs) != 1:
+            raise ValueError(
+                "human_debug mode only supports a single evaluation per run."
+            )
+        config = human_debug_configs[0]
+        if config.max_concurrent != 1:
+            logger.info(
+                "human_debug mode requires sequential execution; forcing max_concurrent=1"
+            )
+            config.max_concurrent = 1
+        if not config.independent_scoring:
+            logger.info(
+                "human_debug mode requires independent scoring; enabling independent_scoring=True"
+            )
+            config.independent_scoring = True
+
     for config in eval_configs:
         logger.debug(f"Evaluation config: {config.model_dump_json(indent=2)}")
 
     eval_run_config = EvalRunConfig(
         evals=eval_configs, heartbeat_url=args.heartbeat_url
     )
-    if args.debug:
-        asyncio.run(run_evaluations(eval_run_config))
+    if args.debug or human_debug_configs:
+        try:
+            asyncio.run(run_evaluations(eval_run_config))
+        except KeyboardInterrupt:
+            logger.info("Interrupted by user.")
+            raise SystemExit(130)
     else:
         asyncio.run(run_evaluations_tui(eval_run_config, tui_mode=args.tui))
 
diff --git a/verifiers/types.py b/verifiers/types.py
index 793bae542..84008ea6a 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -486,6 +486,7 @@ class EvalConfig(BaseModel):
     rollouts_per_example: int
     max_concurrent: int
     independent_scoring: bool = False
+    human_debug: bool = False
     extra_env_kwargs: dict = {}
     max_retries: int = 0
     # logging
diff --git a/verifiers/utils/eval_utils.py b/verifiers/utils/eval_utils.py
index e30f60ff6..2911c7716 100644
--- a/verifiers/utils/eval_utils.py
+++ b/verifiers/utils/eval_utils.py
@@ -17,6 +17,7 @@
 from datasets.utils import logging as ds_logging
 
 import verifiers as vf
+from verifiers.clients.human_cli_client import HumanCLIClient
 from verifiers.types import (
     ClientType,
     Endpoint,
@@ -326,6 +327,7 @@ def load_toml_config(path: Path) -> list[dict]:
         "rollouts_per_example",
         "max_concurrent",
         "independent_scoring",
+        "human_debug",
         "max_retries",
         # logging
         "verbose",
@@ -585,22 +587,28 @@ async def run_evaluation(
     results_path = config.resume_path or get_eval_results_path(config)
 
     try:
-        if config.debug:
-            await vf_env.start_server(
-                extra_env_kwargs=config.extra_env_kwargs,
-                log_level=get_log_level(config.verbose),
-            )
+        if config.human_debug:
+            logger.info("Running evaluation in human debug mode")
+            client = HumanCLIClient()
         else:
-            log_file = results_path / "eval.log"
-            log_file.parent.mkdir(parents=True, exist_ok=True)
-            await vf_env.start_server(
-                extra_env_kwargs=config.extra_env_kwargs,
-                log_level="CRITICAL",  # disable console logging
-                log_file=str(log_file),
-                log_file_level=get_log_level(config.verbose),
-            )
-            if on_log_file is not None:
-                on_log_file(log_file)
+            if config.debug:
+                await vf_env.start_server(
+                    extra_env_kwargs=config.extra_env_kwargs,
+                    log_level=get_log_level(config.verbose),
+                )
+            else:
+                log_file = results_path / "eval.log"
+                log_file.parent.mkdir(parents=True, exist_ok=True)
+                await vf_env.start_server(
+                    extra_env_kwargs=config.extra_env_kwargs,
+                    log_level="CRITICAL",  # disable console logging
+                    log_file=str(log_file),
+                    log_file_level=get_log_level(config.verbose),
+                )
+                if on_log_file is not None:
+                    on_log_file(log_file)
+
+            client = config.client_config
 
         logger.debug(f"Starting evaluation with model: {config.model}")
         logger.debug(
@@ -624,7 +632,7 @@ async def run_evaluation(
                 )
 
         outputs = await vf_env.evaluate(
-            client=config.client_config,
+            client=client,
             model=config.model,
             sampling_args=config.sampling_args,
             num_examples=config.num_examples,