diff --git a/evals/local-demo.yaml b/evals/local-demo.yaml
new file mode 100644
index 00000000..3e6d6b4c
--- /dev/null
+++ b/evals/local-demo.yaml
@@ -0,0 +1,24 @@
+# Full local demo: the with/without-Lightcone A/B across three harnesses on the
+# snae task, run in local Docker. The matrix is {claude, codex, pi} x
+# {with-skills, without-skills}. The report's "Δ lift" column is the headline —
+# how much the Lightcone layer moved each harness's score.
+#
+# Fair, comparable model tier across harnesses (codex Spark was retired): codex
+# and pi both run gpt-5.4-mini, claude runs haiku. pi reaches gpt-5.4-mini via
+# Cail's GitHub Copilot subscription using pi's native `provider/model` string.
+# claude needs CLAUDE_CODE_OAUTH_TOKEN in the host env (loaded from .env).
+id: local-demo
+backend: local_docker
+tasks:
+  - snae
+harnesses:
+  - { name: claude, model: haiku }
+  - { name: codex, model: gpt-5.4-mini }
+  - { name: pi, model: github-copilot/gpt-5.4-mini }
+skill_variants: [true, false]
+num_trials: 1
+max_concurrency: 3
+# No run-level turn cap — agents run to completion (claude still gets the task's
+# max_turns=200). trial_timeout is only a safety ceiling against a hung agent.
+trial_timeout: 1800
+output_dir: eval-results
diff --git a/evals/local-smoke.yaml b/evals/local-smoke.yaml
new file mode 100644
index 00000000..5de8266c
--- /dev/null
+++ b/evals/local-smoke.yaml
@@ -0,0 +1,22 @@
+# Cheap plumbing smoke for the local-Docker multi-harness path. `max_turns`
+# (claude) and a short `trial_timeout` (codex, pi) make each trial exit fast:
+# the build won't complete (graders score low), but the whole path — container
+# build, auth copy-in, headless invoke, output parse, grading, teardown,
+# scorecard — is exercised end to end across all three harnesses. Run this
+# first to shake out orchestration/auth bugs before spending on a full build.
+id: local-smoke
+backend: local_docker
+tasks:
+  - snae
+harnesses:
+  - claude
+  - codex
+  - pi
+skill_variants: [true]
+num_trials: 1
+max_concurrency: 1
+# max_turns only bounds claude; codex and pi have no max-turns flag, so a SHORT
+# trial_timeout is what keeps a smoke cheap for them.
+max_turns: 5
+trial_timeout: 180
+output_dir: eval-results
diff --git a/evals/tasks/snae/astra.yaml b/evals/tasks/snae/astra.yaml
index 464e8122..c1b5f121 100644
--- a/evals/tasks/snae/astra.yaml
+++ b/evals/tasks/snae/astra.yaml
@@ -1,13 +1,31 @@
 # ASTRA Analysis Specification
 # Documentation: https://github.com/LightconeResearch/ASTRA
 
+id: snae
 version: "1.0"
 name: "snae"
-description: |
-  Fit the Union2.1 Type Ia supernova distance modulus vs redshift data
-  to a flat LCDM cosmological model with two free parameters (H0, Omega_L)
-  using maximum-likelihood (MAP) point estimation. This provides best-fit
-  cosmological parameters as a building block for a larger analysis.
+
+narrative:
+  summary: |
+    Fit the Union2.1 Type Ia supernova distance modulus vs redshift data
+    to a flat LCDM cosmological model with two free parameters (H0, Omega_L)
+    using maximum-likelihood (MAP) point estimation. This provides best-fit
+    cosmological parameters as a building block for a larger analysis.
+  inputs: |
+    The single input is the [Union2.1 compilation](#inputs.union21): 580
+    Type Ia supernovae with redshift, distance modulus, and uncertainties.
+  methods: |
+    The fit minimizes a chi-squared between the observed distance moduli and
+    the flat-LCDM prediction, varying H0 and Omega_L. Three decisions shape
+    the fit: the [optimizer](#decisions.optimizer) used for the minimization,
+    the [error model](#decisions.error_model) (statistical-only vs.
+    statistical+systematic uncertainties), and a [low-redshift cut](#decisions.redshift_cut)
+    that optionally removes peculiar-velocity-dominated supernovae.
+  outputs: |
+    Three outputs: the [best-fit parameters](#outputs.best_fit) (H0, Omega_L,
+    reduced chi-squared), a [Hubble diagram](#outputs.hubble_diagram) with the
+    best-fit model overlaid on the data, and a [residuals plot](#outputs.residuals)
+    of data minus model versus redshift.
 
 container: Containerfile
 
@@ -21,22 +39,38 @@ outputs:
   - id: best_fit
     type: metric
     description: "Best-fit H0 and Omega_L from chi-squared minimization, with reduced chi-squared"
+    inputs: [union21]
+    decisions: [optimizer, error_model, redshift_cut]
     recipe:
-      command: python scripts/fit.py
+      command: >-
+        python scripts/fit.py
+        --union21 {inputs.union21}
+        --optimizer {decisions.optimizer}
+        --error-model {decisions.error_model}
+        --redshift-cut {decisions.redshift_cut}
+        --out {output}
 
   - id: hubble_diagram
     type: figure
     description: "Hubble diagram: distance modulus vs redshift with best-fit model overlay"
+    inputs: [union21, best_fit]
     recipe:
-      command: python scripts/plot_hubble.py
-      inputs: [best_fit]
+      command: >-
+        python scripts/plot_hubble.py
+        --union21 {inputs.union21}
+        --best-fit {inputs.best_fit}
+        --out {output}
 
   - id: residuals
     type: figure
     description: "Residuals plot: data minus best-fit model vs redshift"
+    inputs: [union21, best_fit]
     recipe:
-      command: python scripts/plot_residuals.py
-      inputs: [best_fit]
+      command: >-
+        python scripts/plot_residuals.py
+        --union21 {inputs.union21}
+        --best-fit {inputs.best_fit}
+        --out {output}
 
 decisions:
   optimizer:
diff --git a/src/lightcone/eval/backends/__init__.py b/src/lightcone/eval/backends/__init__.py
new file mode 100644
index 00000000..eb9b64b2
--- /dev/null
+++ b/src/lightcone/eval/backends/__init__.py
@@ -0,0 +1,21 @@
+"""Sandbox backends for the eval harness.
+
+A backend is the execution substrate one trial runs inside. All backends share
+the :class:`Sandbox` surface, so a harness drives any of them unchanged.
+
+  - :class:`LocalDockerSandbox` — a local Docker container per trial. The
+    counterpart of the Daytona :class:`lightcone.eval.sandbox.EvalSandbox`, for
+    running the suite on a developer/CI host with Docker rather than a Daytona
+    account.
+"""
+
+from __future__ import annotations
+
+from lightcone.eval.backends.base import ExecuteResult, Sandbox
+from lightcone.eval.backends.local_docker import LocalDockerSandbox
+
+__all__ = [
+    "ExecuteResult",
+    "LocalDockerSandbox",
+    "Sandbox",
+]
diff --git a/src/lightcone/eval/backends/base.py b/src/lightcone/eval/backends/base.py
new file mode 100644
index 00000000..f1f18b87
--- /dev/null
+++ b/src/lightcone/eval/backends/base.py
@@ -0,0 +1,72 @@
+"""Sandbox backend abstraction for eval trials.
+
+A ``Sandbox`` is the execution substrate one eval trial runs inside. It mirrors
+the public surface of the original :class:`lightcone.eval.sandbox.EvalSandbox`
+(the Daytona backend) so harnesses — which depend only on the
+``SandboxLike`` protocol (``WORK_DIR``, ``exec``, ``exec_async_poll``,
+``upload_file``) — drive any backend unchanged.
+
+Backends:
+  - ``EvalSandbox`` (sandbox.py)        — ephemeral Daytona cloud sandbox
+  - ``LocalDockerSandbox`` (this pkg)   — a local Docker container per trial
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass
+class ExecuteResult:
+    """Result from running a command in a sandbox."""
+
+    exit_code: int
+    output: str
+
+
+class Sandbox(ABC):
+    """One ephemeral execution substrate for a single eval trial.
+
+    The lifecycle is ``create() → setup() → (exec/exec_async_poll/upload_file)* →
+    teardown()``. Subclasses provide the concrete substrate (Daytona cloud
+    sandbox, local Docker container, …); the abstract surface here is exactly
+    what the trial loop and the harness layer consume.
+    """
+
+    #: Project root inside the sandbox — where ``lc init`` scaffolds and the
+    #: agent runs. Mirrors ``EvalSandbox.WORK_DIR``.
+    WORK_DIR = "/home/evaluser/project"
+
+    @abstractmethod
+    def create(self) -> None:
+        """Provision the substrate (build image if needed, start the sandbox)."""
+
+    @abstractmethod
+    def setup(
+        self,
+        seed_dir: Path,
+        universe: str,
+        loop_prompt_template: str,
+        wheels: list[Path] | None = None,
+    ) -> None:
+        """Scaffold the project via ``lc init`` and overlay task seed files."""
+
+    @abstractmethod
+    def exec(self, cmd: str, timeout: int = 300, cwd: str | None = None) -> ExecuteResult:
+        """Run a command in the sandbox, returning its exit code and output."""
+
+    @abstractmethod
+    def exec_async_poll(
+        self, cmd: str, timeout: int = 600, poll_interval: int = 10
+    ) -> ExecuteResult:
+        """Run a long-running command, tolerant of gateway timeouts."""
+
+    @abstractmethod
+    def upload_file(self, remote_path: str, content: bytes) -> None:
+        """Upload a file into the sandbox at ``remote_path``."""
+
+    @abstractmethod
+    def teardown(self) -> None:
+        """Destroy the substrate. Idempotent."""
diff --git a/src/lightcone/eval/backends/local_docker.py b/src/lightcone/eval/backends/local_docker.py
new file mode 100644
index 00000000..4feffb0b
--- /dev/null
+++ b/src/lightcone/eval/backends/local_docker.py
@@ -0,0 +1,378 @@
+"""Local-Docker sandbox backend for the eval harness.
+
+Runs each trial in a local Docker container — the on-host/CI counterpart of the
+Daytona :class:`lightcone.eval.sandbox.EvalSandbox`. Drives the ``docker`` CLI
+via ``subprocess`` (no docker SDK), and mirrors ``EvalSandbox``'s public surface
+and ``setup()`` body so harnesses run unchanged.
+
+The image (``lc-eval-local:latest``) bakes in every registered harness's agent
+CLI plus the third-party Python deps the Daytona image pre-installs; it is built
+once and cached by tag. Auth credentials are copied in per-container (``docker
+cp`` + ``chown``) rather than bind-mounted, because the host credential files are
+``0600`` and host-uid-owned and would not be readable by the container's
+``evaluser``.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import re
+import shlex
+import subprocess
+import tempfile
+from pathlib import Path
+
+from lightcone.eval.backends.base import ExecuteResult, Sandbox
+from lightcone.eval.harnesses import available_harnesses, get_harness
+from lightcone.eval.harnesses.base import Harness
+
+logger = logging.getLogger(__name__)
+
+#: Shared image tag — built once, reused across trials.
+IMAGE_TAG = "lc-eval-local:latest"
+
+#: Third-party Python deps the Daytona image pre-installs system-wide. Kept
+#: verbatim in sync with ``EvalSandbox.create``'s ``deps`` so the local image
+#: resolves ``lc`` / ``astra`` identically (the lightcone-cli wheel is installed
+#: ``--no-deps`` at setup time, so every runtime dep it needs is listed here).
+DEPS = (
+    "astra-tools astra-spec"
+    " jinja2 jsonschema"
+    " snakemake snakemake-interface-executor-plugins"
+    " snakemake-interface-common dask distributed"
+)
+
+#: Per-harness host auth files to copy into the container, keyed by harness name.
+#: Each entry is ``(host_path, container_path)``; host paths may use ``~``.
+#: Missing host files are skipped (a harness without credentials is *skipped*,
+#: not failed — mirrors ``credential_env_keys`` semantics).
+CREDENTIAL_FILES: dict[str, list[tuple[str, str]]] = {
+    # claude authenticates purely via CLAUDE_CODE_OAUTH_TOKEN (forwarded env) on
+    # top of the image's onboarding file. We deliberately do NOT copy the host
+    # ~/.claude.json: on macOS it lacks the token anyway (keychain holds it), and
+    # it drags Cail's MCP/project state into every trial container.
+    "claude": [],
+    "codex": [("~/.codex/auth.json", "/home/evaluser/.codex/auth.json")],
+    "pi": [
+        ("~/.pi/agent/auth.json", "/home/evaluser/.pi/agent/auth.json"),
+        ("~/.pi/agent/models.json", "/home/evaluser/.pi/agent/models.json"),
+        ("~/.pi/agent/settings.json", "/home/evaluser/.pi/agent/settings.json"),
+    ],
+}
+
+
+def _sanitize(name: str) -> str:
+    """Coerce ``name`` to the docker container-name charset ``[a-zA-Z0-9_.-]``."""
+    return re.sub(r"[^a-zA-Z0-9_.-]", "-", name)
+
+
+def _install_lines() -> list[str]:
+    """One ``RUN`` line per registered harness's install commands, adapted so the
+    agent CLI lands on ``PATH`` for ``evaluser``.
+
+    Two install shapes exist among the harnesses:
+
+    - **npm globals** (codex, pi): run as root (the default ``RUN`` user) so the
+      binary lands in ``/usr/local/bin`` — on ``PATH`` for every user.
+    - **Claude's curl installer**, whose command ends ``&& cp /root/.local/bin/
+      claude /usr/local/bin/claude`` (assuming a root install): the ``cp`` source
+      doesn't exist when run as ``evaluser``. We rewrite it to install as
+      ``evaluser`` (so its config lands under ``/home/evaluser``) and symlink from
+      ``/home/evaluser/.local/bin`` into ``/usr/local/bin`` as root.
+    """
+    lines: list[str] = []
+    for hname in available_harnesses():
+        for cmd in get_harness(hname).install_commands():
+            # Claude idiom: `<installer> && cp /root/.local/bin/<x> /usr/local/bin/<x>`.
+            m = re.search(r"cp\s+/root/\.local/bin/(\S+)\s+/usr/local/bin/\S+", cmd)
+            if m:
+                binary = m.group(1)
+                installer = cmd[: m.start()].rstrip(" &")
+                lines.append(f"RUN su - evaluser -c {shlex.quote(installer)}")
+                lines.append(
+                    f"RUN ln -sf /home/evaluser/.local/bin/{binary} /usr/local/bin/{binary}"
+                )
+            else:
+                # npm -g and friends: root install → /usr/local/bin.
+                lines.append(f"RUN {cmd}")
+    return lines
+
+
+def _build_dockerfile() -> str:
+    """Generate the Dockerfile for the shared eval image (all harnesses baked in)."""
+    install = "\n".join(_install_lines())
+    # Node 22 (NodeSource) — pi (`@earendil-works/pi-coding-agent`) needs
+    # node >=22.19.0; Debian slim's `nodejs` apt package is v20 and crashes pi at
+    # runtime (`webidl.util.markAsUncloneable is not a function`). codex and
+    # claude are unaffected, but we install one modern Node for all npm globals.
+    return f"""FROM python:3.12-slim
+RUN apt-get update && apt-get install -y git curl bash sudo jq \\
+    && curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \\
+    && apt-get install -y nodejs \\
+    && rm -rf /var/lib/apt/lists/*
+RUN useradd -m -s /bin/bash evaluser \\
+    && echo 'evaluser ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+RUN pip install --no-cache-dir {DEPS}
+{install}
+RUN mkdir -p /home/evaluser/.claude \\
+    && echo '{{"hasCompletedOnboarding": true}}' > /home/evaluser/.claude.json \\
+    && chown -R evaluser:evaluser /home/evaluser/.claude /home/evaluser/.claude.json
+"""
+
+
+class LocalDockerSandbox(Sandbox):
+    """Runs one eval trial inside a local Docker container.
+
+    The container is started detached (``sleep infinity``) and each ``exec`` is a
+    fresh ``docker exec``; ``teardown`` removes it. The image is shared across
+    trials and built once (cached by tag).
+    """
+
+    def __init__(
+        self,
+        task_id: str,
+        trial_id: str,
+        harness: Harness,
+        env_vars: dict[str, str],
+    ) -> None:
+        self.task_id = task_id
+        self.trial_id = trial_id
+        self.harness = harness
+        self.env_vars = env_vars
+        self.cname = _sanitize(f"lc-eval-{trial_id}")
+
+    # ------------------------------------------------------------------ lifecycle
+
+    def create(self) -> None:
+        """Build the shared image (if not cached) and start the trial container."""
+        self._ensure_image()
+
+        env_args: list[str] = []
+        for key, val in self.env_vars.items():
+            env_args += ["-e", f"{key}={val}"]
+        # Forward declared host credentials by NAME only (`-e KEY`): docker reads
+        # the value from our env, keeping secrets out of argv and `_run` errors.
+        for key in self.harness.credential_env_keys():
+            if os.environ.get(key):
+                env_args += ["-e", key]
+
+        self._run(
+            [
+                "docker", "run", "-d",
+                "--name", self.cname,
+                "-u", "evaluser",
+                *env_args,
+                "-w", "/home/evaluser",
+                IMAGE_TAG,
+                "sleep", "infinity",
+            ]
+        )
+        logger.info("Started container %s for trial %s", self.cname, self.trial_id)
+
+        self._copy_credentials()
+
+    def _ensure_image(self) -> None:
+        """Build ``lc-eval-local:latest`` unless already present (tag cache)."""
+        present = subprocess.run(
+            ["docker", "image", "inspect", IMAGE_TAG],
+            capture_output=True,
+        )
+        if present.returncode == 0:
+            logger.info("Reusing cached eval image %s", IMAGE_TAG)
+            return
+
+        # `-`: Dockerfile on stdin, no build context (nothing is COPY'd from cwd).
+        logger.info("Building eval image %s …", IMAGE_TAG)
+        proc = subprocess.run(
+            ["docker", "build", "-t", IMAGE_TAG, "-"],
+            input=_build_dockerfile(),
+            text=True,
+            capture_output=True,
+        )
+        for line in (proc.stdout + proc.stderr).splitlines():
+            logger.info("[image build] %s", line.rstrip())
+        if proc.returncode != 0:
+            raise RuntimeError(f"Failed to build {IMAGE_TAG} (exit {proc.returncode})")
+        logger.info("Built eval image %s", IMAGE_TAG)
+
+    def _copy_credentials(self) -> None:
+        """Copy host auth files into the container (``docker cp`` + ``chown``).
+
+        Bind mounts won't work: host credential files are ``0600`` and
+        host-uid-owned, unreadable by the container's ``evaluser``. So we copy
+        them in and re-own to ``evaluser``. Missing host files are skipped.
+        """
+        for host_raw, container_path in CREDENTIAL_FILES.get(self.harness.name, []):
+            host_path = Path(host_raw).expanduser()
+            if not host_path.is_file():
+                continue
+            parent = os.path.dirname(container_path)
+            self._run(["docker", "exec", "-u", "evaluser", self.cname, "mkdir", "-p", parent])
+            self._run(["docker", "cp", str(host_path), f"{self.cname}:{container_path}"])
+            self._run(
+                ["docker", "exec", "-u", "root", self.cname,
+                 "chown", "-R", "evaluser:evaluser", container_path]
+            )
+
+    def teardown(self) -> None:
+        """Force-remove the container. Idempotent (errors ignored)."""
+        subprocess.run(
+            ["docker", "rm", "-f", self.cname],
+            capture_output=True,
+        )
+
+    # ------------------------------------------------------------------ exec / io
+
+    def exec(self, cmd: str, timeout: int = 300, cwd: str | None = None) -> ExecuteResult:
+        """Run ``cmd`` as ``evaluser`` via ``docker exec ... bash -lc``."""
+        full = f"cd {shlex.quote(cwd)} && {cmd}" if cwd else cmd
+        try:
+            proc = subprocess.run(
+                ["docker", "exec", "-u", "evaluser", self.cname, "bash", "-lc", full],
+                capture_output=True,
+                text=True,
+                timeout=timeout,
+            )
+        except subprocess.TimeoutExpired:
+            return ExecuteResult(124, f"Timed out after {timeout}s")
+        return ExecuteResult(proc.returncode, proc.stdout + proc.stderr)
+
+    def exec_async_poll(
+        self, cmd: str, timeout: int = 600, poll_interval: int = 10
+    ) -> ExecuteResult:
+        """Run a long command. Locally a blocking ``docker exec`` is fine — there
+        is no gateway timeout to drop the connection — so delegate to ``exec``."""
+        return self.exec(cmd, timeout=timeout)
+
+    def upload_file(self, remote_path: str, content: bytes) -> None:
+        """Write ``content`` to ``remote_path`` inside the container."""
+        with tempfile.NamedTemporaryFile(delete=False) as tmp:
+            tmp.write(content)
+            tmp_path = tmp.name
+        try:
+            parent = os.path.dirname(remote_path)
+            if parent:
+                self._run(
+                    ["docker", "exec", "-u", "evaluser", self.cname, "mkdir", "-p", parent]
+                )
+            self._run(["docker", "cp", tmp_path, f"{self.cname}:{remote_path}"])
+            self._run(
+                ["docker", "exec", "-u", "root", self.cname,
+                 "chown", "evaluser:evaluser", remote_path]
+            )
+        finally:
+            os.unlink(tmp_path)
+
+    # ------------------------------------------------------------------ setup
+
+    def setup(
+        self,
+        seed_dir: Path,
+        universe: str,
+        loop_prompt_template: str,
+        wheels: list[Path] | None = None,
+    ) -> None:
+        """Scaffold the project via ``lc init`` and overlay task seed files.
+
+        Faithful to :meth:`EvalSandbox.setup`: write the global config, run
+        ``lc init --no-git --no-venv``, overlay the task seed dir, regenerate the
+        baseline universe, stage the loop prompt, then ``git init`` + seed commit.
+        Wheels are installed system-wide (via ``sudo``) so ``lc`` / ``astra``
+        resolve before they're invoked.
+        """
+        if wheels:
+            self._install_wheels(wheels)
+
+        # Pin the global config explicitly for reproducibility (lc would
+        # auto-create it, but writing it makes the runtime deterministic).
+        self.exec(
+            "mkdir -p ~/.lightcone"
+            " && printf 'container:\\n  runtime: auto\\n' > ~/.lightcone/config.yaml"
+        )
+
+        # Scaffold from the wheel under test. --no-venv: deps are system-wide.
+        # --no-git: we git init below, after the overlay, so the seed commit
+        # captures the task files too.
+        result = self.exec(
+            f"mkdir -p {self.WORK_DIR} && lc init {self.WORK_DIR} --no-git --no-venv",
+            timeout=120,
+        )
+        if result.exit_code != 0:
+            raise RuntimeError(
+                f"`lc init` failed (exit {result.exit_code}):\n{result.output[-2000:]}"
+            )
+
+        # Overlay task seed files (astra.yaml, data/, task.yaml).
+        self._upload_directory(seed_dir, self.WORK_DIR)
+
+        # Regenerate baseline.yaml from the task astra.yaml's defaults (the one
+        # lc init produced matches the boilerplate astra.yaml, now replaced).
+        self.exec(
+            f"cd {self.WORK_DIR}"
+            f" && rm -f universes/baseline.yaml"
+            f" && astra universe generate -n baseline"
+            f" -d 'Default configuration using standard practices'",
+            timeout=60,
+        )
+
+        # Template and stage the loop prompt for the agent.
+        prompt = loop_prompt_template.replace("{{UNIVERSE}}", universe)
+        self.upload_file("/tmp/loop-prompt.md", prompt.encode())
+
+        # Initial commit captures the full project state.
+        self.exec(
+            f"cd {self.WORK_DIR}"
+            " && git config --global user.name Eval"
+            " && git config --global user.email eval@lightcone"
+            " && git init -q && git add -A && git commit -q -m 'seed'"
+        )
+
+    def _install_wheels(self, wheels: list[Path]) -> None:
+        """Upload the lightcone-cli wheel(s) and install them system-wide.
+
+        ``--no-deps`` (every runtime dep is baked into the image) and
+        ``--force-reinstall`` (the local wheel always overrides any PyPI version).
+        ``sudo`` so the install is system-wide and ``lc`` / ``astra`` resolve for
+        ``evaluser`` — mirroring the Daytona image's root-installed deps.
+        """
+        self.exec("mkdir -p /tmp/deps")
+
+        remote_paths: list[str] = []
+        for whl in wheels:
+            remote_path = f"/tmp/deps/{whl.name}"
+            self.upload_file(remote_path, whl.read_bytes())
+            remote_paths.append(remote_path)
+
+        whl_cmd = (
+            "sudo pip install --no-deps --force-reinstall "
+            + " ".join(shlex.quote(p) for p in remote_paths)
+        )
+        result = self.exec(whl_cmd, timeout=120)
+        if result.exit_code != 0:
+            logger.warning(
+                "Failed to install wheels (exit %d):\n...%s",
+                result.exit_code,
+                result.output[-2000:],
+            )
+        else:
+            logger.info("Installed wheels: %s", [w.name for w in wheels])
+
+    def _upload_directory(self, local_dir: Path, remote_dir: str) -> None:
+        """Upload every file under ``local_dir`` into ``remote_dir``, keeping its layout."""
+        for local_path in local_dir.rglob("*"):
+            if local_path.is_file():
+                rel = local_path.relative_to(local_dir).as_posix()  # container paths use "/"
+                self.upload_file(f"{remote_dir}/{rel}", local_path.read_bytes())
+
+    # ------------------------------------------------------------------ helpers
+
+    def _run(self, args: list[str]) -> subprocess.CompletedProcess[str]:
+        """Run a docker CLI command, raising with captured output on failure."""
+        proc = subprocess.run(args, capture_output=True, text=True)
+        if proc.returncode != 0:
+            raise RuntimeError(
+                f"`{' '.join(args)}` failed (exit {proc.returncode}):\n"
+                f"{(proc.stdout + proc.stderr)[-2000:]}"
+            )
+        return proc
diff --git a/src/lightcone/eval/cli.py b/src/lightcone/eval/cli.py
index f1a253a1..c49c097b 100644
--- a/src/lightcone/eval/cli.py
+++ b/src/lightcone/eval/cli.py
@@ -53,7 +53,12 @@ def run_cmd(
         lc eval run evals/example-run.yaml --num-trials 1 --concurrency 2
     """
     from lightcone.eval.harness import load_run_config, run_eval
-    from lightcone.eval.report import compute_summary, print_comparison_table, save_results
+    from lightcone.eval.report import (
+        compute_summary,
+        print_comparison_table,
+        print_matrix_table,
+        save_results,
+    )
 
     config = load_run_config(config_path)
 
@@ -99,7 +104,10 @@ def _on_trial_complete(trial: object) -> None:
     if dry_run:
         schedule = eval_run.summary.get("schedule", [])
         for s in schedule:
-            console.print(f"  {s['task']} trial {s['trial']}")
+            skills = "skills" if s.get("skills") else "bare"
+            console.print(
+                f"  {s['task']} · {s.get('harness', '?')} · {skills} · trial {s['trial']}"
+            )
         console.print(f"\n[bold]Total: {eval_run.summary.get('total_trials', 0)} trials[/bold]")
         return
 
@@ -117,6 +125,8 @@ def _on_trial_complete(trial: object) -> None:
     eval_run.summary = compute_summary(eval_run)
     console.print()
     print_comparison_table(eval_run, console=console)
+    console.print()
+    print_matrix_table(eval_run, console=console)
 
     # Save results
     output_path = save_results(eval_run, config.output_dir)
diff --git a/src/lightcone/eval/harness.py b/src/lightcone/eval/harness.py
index 138b8220..fae29616 100644
--- a/src/lightcone/eval/harness.py
+++ b/src/lightcone/eval/harness.py
@@ -17,15 +17,44 @@
 
 from lightcone.eval.build import build_eval_wheels
 from lightcone.eval.graders import compute_composite_score, run_graders
+from lightcone.eval.harnesses import Harness, get_harness
 from lightcone.eval.models import (
     EvalRun,
     EvalRunConfig,
+    HarnessSpec,
     IterationResult,
     TaskSpec,
     TrialResult,
 )
 from lightcone.eval.sandbox import BUILD_COMPLETE_MARKER, EvalSandbox
 
+
+def _make_sandbox(
+    config: EvalRunConfig,
+    task: TaskSpec,
+    trial_id: str,
+    harness: Harness,
+    env_vars: dict[str, str],
+) -> Any:
+    """Construct the trial's sandbox for the configured backend.
+
+    ``local_docker`` runs the agent in a throwaway local container (the default
+    for local demos); ``daytona`` is the original cloud-sandbox path used in CI.
+    Both satisfy the ``SandboxLike`` protocol the harness drives.
+    """
+    if config.backend == "local_docker":
+        from lightcone.eval.backends.local_docker import LocalDockerSandbox
+
+        return LocalDockerSandbox(
+            task_id=task.id, trial_id=trial_id, harness=harness, env_vars=env_vars
+        )
+    return EvalSandbox(
+        task_id=task.id,
+        trial_id=trial_id,
+        sandbox_image=config.sandbox_image,
+        env_vars=env_vars,
+    )
+
 logger = logging.getLogger(__name__)
 
 DEFAULT_LOOP_PROMPT = """\
@@ -83,32 +112,38 @@ def run_trial(
     task: TaskSpec,
     trial_number: int,
     *,
+    harness_spec: HarnessSpec,
+    with_skills: bool,
     evals_dir: Path,
     config: EvalRunConfig,
     run_id: str,
     wheels: list[Path],
     sidecar_dir: Path | None = None,
 ) -> TrialResult:
-    """Run a single trial: create sandbox -> run the build prompt -> grade -> teardown."""
-    trial_id = f"{run_id}-{task.id}-{trial_number}"
+    """Run one trial: create sandbox -> prepare -> run the agent -> grade -> teardown."""
+    variant = "skills" if with_skills else "bare"
+    trial_id = f"{run_id}-{task.id}-{harness_spec.name}-{variant}-{trial_number}"
     trial = TrialResult(
         trial_id=trial_id,
         task_id=task.id,
+        harness=harness_spec.name,
+        with_skills=with_skills,
         trial_number=trial_number,
         started_at=datetime.now(UTC),
     )
 
+    harness = get_harness(harness_spec.name)
+    # The local-Docker backend only forwards what we pass here (it doesn't
+    # inject eval metadata the way EvalSandbox does), so set it explicitly.
     env_vars = {
+        "LIGHTCONE_EVAL": "true",
         "LIGHTCONE_EVAL_RUN_ID": run_id,
+        "LIGHTCONE_EVAL_TRIAL_ID": trial_id,
+        "LIGHTCONE_EVAL_TASK_ID": task.id,
         "CLAUDE_CODE_SESSION_ID": f"eval-{trial_id}",
     }
 
-    sandbox = EvalSandbox(
-        task_id=task.id,
-        trial_id=trial_id,
-        sandbox_image=config.sandbox_image,
-        env_vars=env_vars,
-    )
+    sandbox = _make_sandbox(config, task, trial_id, harness, env_vars)
 
     try:
         sandbox.create()
@@ -123,13 +158,22 @@ def run_trial(
             wheels=wheels,
         )
 
+        # With/without-Lightcone A/B: prepare strips the scaffold for the bare
+        # arm (or no-ops for harnesses that gate skills via invoke flags).
+        harness.prepare(sandbox, work_dir=sandbox.WORK_DIR, with_skills=with_skills)
+
         # Single invocation with a high max-turns budget — the prompt is
         # self-contained and the agent loops over outputs internally.
         start = time.monotonic()
         try:
-            claude_result = sandbox.exec_claude(
-                max_turns=task.max_turns,
-                timeout=task.trial_timeout,
+            claude_result = harness.invoke(
+                sandbox,
+                prompt_path="/tmp/loop-prompt.md",
+                work_dir=sandbox.WORK_DIR,
+                max_turns=config.max_turns or task.max_turns,
+                timeout=config.trial_timeout or task.trial_timeout,
+                with_skills=with_skills,
+                model=harness_spec.model,
             )
             duration = time.monotonic() - start
 
@@ -198,11 +242,20 @@ def run_eval(
     # Load all tasks
     tasks = [load_task(evals_dir, tid) for tid in config.tasks]
 
-    # Build trial schedule
+    # Build trial schedule: tasks x harnesses x skill_variants x num_trials
     schedule: list[dict[str, Any]] = []
     for task in tasks:
-        for n in range(config.num_trials):
-            schedule.append({"task": task, "trial_number": n})
+        for hspec in config.harnesses:
+            for with_skills in config.skill_variants:
+                for n in range(config.num_trials):
+                    schedule.append(
+                        {
+                            "task": task,
+                            "harness_spec": hspec,
+                            "with_skills": with_skills,
+                            "trial_number": n,
+                        }
+                    )
 
     if dry_run:
         return EvalRun(
@@ -210,7 +263,12 @@ def run_eval(
             started_at=datetime.now(UTC),
             finished_at=datetime.now(UTC),
             summary={"dry_run": True, "total_trials": len(schedule), "schedule": [
-                {"task": s["task"].id, "trial": s["trial_number"]}
+                {
+                    "task": s["task"].id,
+                    "harness": s["harness_spec"].name,
+                    "skills": s["with_skills"],
+                    "trial": s["trial_number"],
+                }
                 for s in schedule
             ]},
         )
@@ -252,6 +310,8 @@ def _signal_handler(signum: int, frame: Any) -> None:
                     run_trial,
                     s["task"],
                     s["trial_number"],
+                    harness_spec=s["harness_spec"],
+                    with_skills=s["with_skills"],
                     evals_dir=evals_dir,
                     config=config,
                     run_id=run_id,
@@ -272,6 +332,8 @@ def _signal_handler(signum: int, frame: Any) -> None:
                     trial = TrialResult(
                         trial_id=f"{run_id}-error",
                         task_id=s["task"].id,
+                        harness=s["harness_spec"].name,
+                        with_skills=s["with_skills"],
                         trial_number=s["trial_number"],
                         error=str(exc),
                     )
diff --git a/src/lightcone/eval/harnesses/__init__.py b/src/lightcone/eval/harnesses/__init__.py
new file mode 100644
index 00000000..dc2623eb
--- /dev/null
+++ b/src/lightcone/eval/harnesses/__init__.py
@@ -0,0 +1,50 @@
+"""Harness registry — maps a harness id to its :class:`Harness` implementation.
+
+To add a harness: create ``harnesses/<name>.py`` with a ``Harness`` subclass,
+import it here, and add it to ``_REGISTRY``. The eval run config selects
+harnesses by id (``harnesses: [claude, codex, pi]``).
+"""
+
+from __future__ import annotations
+
+from lightcone.eval.harnesses.base import (
+    AgentResult,
+    CommandResult,
+    Harness,
+    SandboxLike,
+)
+from lightcone.eval.harnesses.claude import ClaudeHarness
+from lightcone.eval.harnesses.codex import CodexHarness
+from lightcone.eval.harnesses.pi import PiHarness
+
+# Registry id -> harness class. Each key is the class's own ``name`` attribute,
+# so an id is spelled in exactly one place.
+_REGISTRY: dict[str, type[Harness]] = {
+    cls.name: cls for cls in (ClaudeHarness, CodexHarness, PiHarness)
+}
+
+
+def get_harness(name: str) -> Harness:
+    """Return a new instance of the harness registered under ``name``;
+    raise ``ValueError`` naming the known ids when there is none."""
+    harness_cls = _REGISTRY.get(name)
+    if harness_cls is None:
+        known = sorted(_REGISTRY)
+        raise ValueError(f"Unknown harness {name!r}; known harnesses: {known}")
+    return harness_cls()
+
+
+def available_harnesses() -> list[str]:
+    """Every registered harness id, in sorted order."""
+    return sorted(_REGISTRY.keys())
+
+
+__all__ = [
+    "AgentResult",
+    "ClaudeHarness",
+    "CommandResult",
+    "Harness",
+    "SandboxLike",
+    "available_harnesses",
+    "get_harness",
+]
diff --git a/src/lightcone/eval/harnesses/base.py b/src/lightcone/eval/harnesses/base.py
new file mode 100644
index 00000000..5eeda421
--- /dev/null
+++ b/src/lightcone/eval/harnesses/base.py
@@ -0,0 +1,131 @@
+"""Harness abstraction: one agent CLI driven headlessly inside a Sandbox.
+
+A ``Harness`` is the single seam where the eval becomes agent-specific. Before
+this layer, ``EvalSandbox.exec_claude`` hardwired Claude Code into the trial
+loop. A harness declares four things and nothing more:
+
+1. how to **install** its agent CLI into the eval image,
+2. which host **credentials** to forward for auth,
+3. optional per-trial project **prepare** (e.g. strip the Lightcone scaffold for
+   the without-skills arm of the A/B), and
+4. how to **invoke** the agent headlessly, returning a parsed ``AgentResult``.
+
+Everything downstream stays harness-agnostic: graders read the materialized
+filesystem (``lc status --json``, ``astra validate``), never the agent. The
+concrete per-harness commands live in the ``harness-invocation-matrix`` fiber.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Protocol, runtime_checkable
+
+
+@dataclass
+class AgentResult:
+    """Parsed result of one headless agent invocation, normalized across
+    harnesses.
+
+    ``cost_usd`` is best-effort: harnesses that do not price a run (e.g. codex)
+    report ``0.0``. ``raw_jsonl`` keeps the agent's full stdout for the trial
+    transcript sidecar.
+    """
+
+    cost_usd: float = 0.0  # USD; stays 0.0 when the harness reports no price
+    num_turns: int = 0  # turns reported by, or counted from, the agent output
+    duration_ms: int = 0  # run duration in milliseconds
+    result_text: str = ""  # final agent text, or a raw-output fallback
+    is_error: bool = False  # set on a non-zero exit code or a parse failure
+    raw_jsonl: str = ""  # unparsed agent stdout
+
+
+@runtime_checkable
+class CommandResult(Protocol):
+    """Anything with an exit code and captured output — e.g. the sandbox
+    backends' ``ExecuteResult``."""
+
+    exit_code: int  # harness parsers treat any non-zero value as a failure
+    output: str  # captured command output that the harness parsers read
+
+
+@runtime_checkable
+class SandboxLike(Protocol):
+    """The execution substrate a ``Harness`` drives.
+
+    Both ``DaytonaSandbox`` and ``LocalDockerSandbox`` satisfy this. A harness
+    uses only these primitives, so it never depends on a specific backend.
+    """
+
+    WORK_DIR: str  # project directory inside the sandbox
+
+    def exec(self, cmd: str, timeout: int = 300, cwd: str | None = None) -> CommandResult: ...
+
+    def exec_async_poll(
+        self, cmd: str, timeout: int = 600, poll_interval: int = 10
+    ) -> CommandResult: ...  # the harnesses here run the agent CLI through this
+
+    def upload_file(self, remote_path: str, content: bytes) -> None: ...
+
+
+class Harness(ABC):
+    """One agent CLI, driven headlessly inside a sandbox.
+
+    Subclasses set ``name`` (the registry id) and implement install /
+    credentials / invoke. ``prepare`` has a sensible default — strip the
+    Lightcone scaffold for the without-skills arm — that file-scaffold harnesses
+    (Claude) inherit and flag-based harnesses (pi) may override.
+    """
+
+    #: registry id, e.g. "claude" / "codex" / "pi"
+    name: str = ""
+
+    @abstractmethod
+    def install_commands(self) -> list[str]:
+        """Shell commands (run as root at image-build time) that install the
+        agent CLI onto ``PATH``. Consumed by every sandbox backend's image
+        build so the binary is present before a trial starts."""
+
+    @abstractmethod
+    def credential_env_keys(self) -> list[str]:
+        """Host env var names to forward into the sandbox for auth. Missing keys
+        are skipped — a harness with no credentials is reported *skipped*, not
+        failed."""
+
+    @abstractmethod
+    def invoke(
+        self,
+        sandbox: SandboxLike,
+        *,
+        prompt_path: str,
+        work_dir: str,
+        max_turns: int,
+        timeout: int,
+        with_skills: bool,
+        model: str | None = None,
+    ) -> AgentResult:
+        """Run the agent headlessly against the project in ``work_dir``, reading
+        the loop prompt from ``prompt_path`` inside the sandbox.
+
+        ``with_skills`` lets harnesses with native skill flags (pi's
+        ``--skill`` / ``--no-skills``) toggle skill loading at invoke time;
+        file-scaffold harnesses rely on ``prepare`` having stripped the scaffold
+        instead, and can ignore it.
+        """
+
+    def prepare(
+        self, sandbox: SandboxLike, *, work_dir: str, with_skills: bool
+    ) -> None:
+        """Per-trial project prep before invocation.
+
+        Default: for the without-skills arm, strip the Lightcone scaffold so the
+        agent runs bare (``.claude/`` skills+hooks, ``CLAUDE.md``, and any
+        sibling agent-context files). The ``lc`` engine is intentionally left in
+        place — the A/B isolates the *guidance layer*, not the execution
+        substrate. Harnesses that gate skills via invoke flags may override.
+        """
+        if not with_skills:
+            import shlex  # function-scope: the only shell string this module builds
+            # Quote the path so spaces/metacharacters in it cannot split the command.
+            scaffold = ".claude CLAUDE.md AGENTS.md GEMINI.md"
+            sandbox.exec(f"cd {shlex.quote(work_dir)} && rm -rf {scaffold}", timeout=30)
diff --git a/src/lightcone/eval/harnesses/claude.py b/src/lightcone/eval/harnesses/claude.py
new file mode 100644
index 00000000..ab4ade52
--- /dev/null
+++ b/src/lightcone/eval/harnesses/claude.py
@@ -0,0 +1,93 @@
+"""Claude Code harness — ``claude -p`` headless (stream-json)."""
+
+from __future__ import annotations
+
+import json
+import shlex
+import time
+
+from lightcone.eval.harnesses.base import AgentResult, Harness, SandboxLike
+
+
+class ClaudeHarness(Harness):
+    """Drives Claude Code. This is the original (and reference) seam: the trial
+    loop used to call this logic directly via ``EvalSandbox.exec_claude``."""
+
+    name = "claude"
+
+    def install_commands(self) -> list[str]:
+        """Run the install script, then copy the binary to ``/usr/local/bin``."""
+        installer = "curl -fsSL https://claude.ai/install.sh | bash"
+        return [f"{installer} && cp /root/.local/bin/claude /usr/local/bin/claude"]
+
+    def credential_env_keys(self) -> list[str]:
+        """Host env vars forwarded for auth: an OAuth token or an API key."""
+        return ["CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"]
+
+    def invoke(
+        self,
+        sandbox: SandboxLike,
+        *,
+        prompt_path: str,
+        work_dir: str,
+        max_turns: int,
+        timeout: int,
+        with_skills: bool,
+        model: str | None = None,
+    ) -> AgentResult:
+        # Claude Code reads skills from the scaffolded `.claude/` dir, so the A/B
+        # split is done by `prepare` (file strip); `with_skills` is unused here.
+        # Both paths are shell-quoted so neither can split the command line.
+        model_flag = f"--model {shlex.quote(model)}" if model else ""
+        cmd = (
+            f"cd {shlex.quote(work_dir)} && "
+            f'claude -p "$(cat {shlex.quote(prompt_path)})" '
+            f"--output-format stream-json --verbose "
+            f"--dangerously-skip-permissions "
+            f"--max-turns {max_turns} "
+            f"{model_flag}"
+        ).strip()
+
+        start = time.monotonic()
+        result = sandbox.exec_async_poll(cmd, timeout=timeout)
+        duration_ms = int((time.monotonic() - start) * 1000)
+        return parse_claude_output(result.output, result.exit_code, duration_ms)
+
+
+def parse_claude_output(
+    raw_output: str, exit_code: int, duration_ms: int
+) -> AgentResult:
+    """Parse JSONL from ``claude -p --output-format stream-json``.
+
+    The last ``{"type": "result", ...}`` line carries the aggregate metrics;
+    null or missing metrics fall back to defaults instead of raising.
+    """
+    result = AgentResult(duration_ms=duration_ms, raw_jsonl=raw_output)
+
+    if exit_code != 0:
+        result.is_error = True
+        result.result_text = raw_output
+        return result
+
+    for raw_line in reversed(raw_output.strip().splitlines()):
+        stripped = raw_line.strip()
+        if not stripped.startswith("{"):
+            continue
+        try:
+            data = json.loads(stripped)
+            if data.get("type") != "result":
+                continue
+            cost = float(data.get("cost_usd", data.get("total_cost_usd")) or 0.0)
+            turns = int(data.get("num_turns") or 0)
+            elapsed = int(data.get("duration_ms") or duration_ms)
+        except (ValueError, TypeError):  # malformed JSON or a non-numeric metric
+            continue
+        result.cost_usd, result.num_turns, result.duration_ms = cost, turns, elapsed
+        result.result_text = str(data.get("result") or "")
+        result.is_error = bool(data.get("is_error", False))
+        return result
+
+    # No result line found
+    result.result_text = raw_output
+    result.is_error = True
+    return result
diff --git a/src/lightcone/eval/harnesses/codex.py b/src/lightcone/eval/harnesses/codex.py
new file mode 100644
index 00000000..189b8287
--- /dev/null
+++ b/src/lightcone/eval/harnesses/codex.py
@@ -0,0 +1,104 @@
+"""OpenAI Codex CLI harness — ``codex exec`` headless (JSONL)."""
+
+from __future__ import annotations
+
+import json
+import shlex
+import time
+
+from lightcone.eval.harnesses.base import AgentResult, Harness, SandboxLike
+
+
+class CodexHarness(Harness):
+    """Headless ``codex exec`` driver — the OpenAI Codex CLI counterpart of
+    :class:`ClaudeHarness`, reporting its run as an ``AgentResult``."""
+
+    name = "codex"
+
+    def install_commands(self) -> list[str]:
+        # Containers install through npm (the eval base image ships node/npm);
+        # the brew cask is only how the binary gets onto a local machine.
+        return ["npm install -g @openai/codex"]
+
+    def credential_env_keys(self) -> list[str]:
+        return ["OPENAI_API_KEY", "CODEX_API_KEY"]
+
+    # No `prepare` override: the base default strips the scaffold for the
+    # without-skills arm. Codex reads `AGENTS.md` rather than `.claude/skills`,
+    # so "with-skills" guidance never actually reaches it — a known limitation
+    # tracked for LCR-85; no AGENTS.md is synthesized here.
+
+    def invoke(
+        self,
+        sandbox: SandboxLike,
+        *,
+        prompt_path: str,
+        work_dir: str,
+        max_turns: int,
+        timeout: int,
+        with_skills: bool,
+        model: str | None = None,
+    ) -> AgentResult:
+        # The Codex CLI takes no max-turns option, so only `timeout` bounds the
+        # run; `with_skills` exists for interface uniformity (see `prepare`).
+        pieces = [
+            f"codex exec --cd {shlex.quote(work_dir)}",
+            "--dangerously-bypass-approvals-and-sandbox",
+            "--skip-git-repo-check --json",
+            f"-m {shlex.quote(model)}" if model else "",
+            "-o /tmp/codex-last-message.txt",
+            f'"$(cat {shlex.quote(prompt_path)})"',
+        ]
+        cmd = " ".join(pieces).strip()
+
+        began = time.monotonic()
+        run = sandbox.exec_async_poll(cmd, timeout=timeout)
+        elapsed_ms = int((time.monotonic() - began) * 1000)
+
+        # `-o` wrote the agent's final message to a file; read it back, treating
+        # an unreadable file as "no message".
+        tail = sandbox.exec("cat /tmp/codex-last-message.txt")
+        final_message = "" if tail.exit_code != 0 else tail.output
+
+        return parse_codex_output(run.output, run.exit_code, elapsed_ms, final_message)
+
+
+def parse_codex_output(
+    raw_jsonl: str, exit_code: int, duration_ms: int, last_message: str
+) -> AgentResult:
+    """Turn ``codex exec --json`` stdout into an ``AgentResult``.
+
+    The event schema is unconfirmed, so parsing is lenient: lines that are not
+    valid JSON are ignored and nothing here raises on them. ``cost_usd`` stays
+    ``0.0`` because Codex does not price a run. ``result_text`` is the ``-o``
+    last-message text when non-empty, else the last non-blank stdout line.
+    ``num_turns`` is a best-effort count of assistant/turn events.
+    """
+    turn_events = 0
+    final_line = ""
+
+    for line in map(str.strip, raw_jsonl.splitlines()):
+        if not line:
+            continue
+        final_line = line  # kept for any non-blank line, JSON or not
+        if not line.startswith("{"):
+            continue
+        try:
+            event = json.loads(line)
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            continue
+        # The event kind may sit under "type" or "event"; any kind mentioning a
+        # turn or the assistant counts. NOTE(review): if the CLI emits paired
+        # start/complete turn events, each turn is counted twice — confirm.
+        kind = str(event.get("type", "") or event.get("event", ""))
+        if "turn" in kind or "assistant" in kind:
+            turn_events += 1
+
+    return AgentResult(
+        cost_usd=0.0,
+        num_turns=turn_events,
+        duration_ms=duration_ms,
+        result_text=last_message or final_line,
+        is_error=exit_code != 0,
+        raw_jsonl=raw_jsonl,
+    )
diff --git a/src/lightcone/eval/harnesses/pi.py b/src/lightcone/eval/harnesses/pi.py
new file mode 100644
index 00000000..43be5abb
--- /dev/null
+++ b/src/lightcone/eval/harnesses/pi.py
@@ -0,0 +1,133 @@
+"""pi harness — ``pi -p`` headless (``@earendil-works/pi-coding-agent``).
+
+Unlike Claude Code, pi gates skill loading via *invoke flags* (``--skill`` /
+``--no-skills``), not via the on-disk scaffold. So this harness keeps the
+Lightcone scaffold in place (no-op ``prepare``) and lets the with/without-skills
+flags decide what the agent sees.
+"""
+
+from __future__ import annotations
+
+import json
+import shlex
+import time
+
+from lightcone.eval.harnesses.base import AgentResult, Harness, SandboxLike
+
+
+class PiHarness(Harness):
+    """Drives the pi coding agent (``@earendil-works/pi-coding-agent``).
+
+    pi uses the process CWD as the project dir (no ``--cd`` flag), so every
+    command is prefixed with ``cd <work_dir> &&``. Skill loading is toggled at
+    invoke time, not by file-stripping.
+    """
+
+    name = "pi"
+
+    def install_commands(self) -> list[str]:
+        return ["npm install -g @earendil-works/pi-coding-agent"]
+
+    def credential_env_keys(self) -> list[str]:
+        return ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GEMINI_API_KEY"]
+
+    def prepare(
+        self, sandbox: SandboxLike, *, work_dir: str, with_skills: bool
+    ) -> None:
+        """No-op: pi gates skills via invoke flags (``--skill`` / ``--no-skills``),
+        not by stripping the on-disk scaffold. We keep the scaffold in place and
+        let the flags in ``invoke`` decide what the agent loads."""
+
+    def invoke(
+        self,
+        sandbox: SandboxLike,
+        *,
+        prompt_path: str,
+        work_dir: str,
+        max_turns: int,
+        timeout: int,
+        with_skills: bool,
+        model: str | None = None,
+    ) -> AgentResult:
+        # pi takes its project dir from the CWD (no `--cd`), hence the `cd` prefix;
+        # `--no-session` keeps the run ephemeral. `max_turns` is not forwarded.
+        if with_skills:
+            # Load the lc-init scaffold's skills dir (relative to work_dir).
+            skill_flags = "--skill .claude/skills"
+        else:
+            # Bare run: disable skill discovery and context files (CLAUDE.md etc).
+            skill_flags = "--no-skills --no-context-files"
+
+        model_flag = ""
+        if model:
+            model_flag = f"--model {shlex.quote(model)}"
+            # Heuristic: bare Claude-family ids need the anthropic provider (pi's
+            # default is google); a `provider/model` id already names its own.
+            family = ("claude", "sonnet", "opus", "haiku")
+            if "/" not in model and any(tag in model.lower() for tag in family):
+                model_flag = f"--provider anthropic {model_flag}"
+
+        cmd = (
+            f"cd {shlex.quote(work_dir)} && "
+            f'pi -p "$(cat {shlex.quote(prompt_path)})" '
+            f"--mode json --no-session "
+            f"{skill_flags} "
+            f"{model_flag}"
+        ).strip()
+
+        start = time.monotonic()
+        result = sandbox.exec_async_poll(cmd, timeout=timeout)
+        duration_ms = int((time.monotonic() - start) * 1000)
+        return parse_pi_output(result.output, result.exit_code, duration_ms)
+
+
+def parse_pi_output(raw: str, exit_code: int, duration_ms: int) -> AgentResult:
+    """Turn ``pi -p --mode json`` stdout into an ``AgentResult``.
+
+    The ``--mode json`` shape is UNCONFIRMED, hence the leniency: the whole stdout
+    is tried as one JSON document, then its last non-blank line. Cost, turn and
+    final-text fields are read under several plausible aliases. Malformed JSON
+    never raises — ``result_text`` then falls back to the last non-blank line.
+    """
+    result = AgentResult(duration_ms=duration_ms, raw_jsonl=raw, is_error=exit_code != 0)
+    body = raw.strip()
+    nonblank = [ln.strip() for ln in body.splitlines() if ln.strip()]
+    tail = nonblank[-1] if nonblank else ""
+
+    data: dict | None = None
+    for candidate in filter(None, (body, tail)):
+        try:
+            decoded = json.loads(candidate)
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            continue
+        if isinstance(decoded, dict):
+            data = decoded
+            break
+
+    if data is None:
+        # No parseable JSON object — best-effort fall back to the last line.
+        result.result_text = tail
+        return result
+
+    # Walk the aliases last-to-first so the first-listed one that is present wins.
+    cost = 0.0
+    for alias in ("total_cost_usd", "cost_usd", "cost"):
+        cost = data.get(alias, cost)
+    try:
+        result.cost_usd = float(cost)
+    except (TypeError, ValueError):
+        result.cost_usd = 0.0
+
+    turns = data.get("num_turns", data.get("turns", 0))
+    try:
+        result.num_turns = int(turns)
+    except (TypeError, ValueError):
+        result.num_turns = 0
+
+    # The first alias with a truthy value becomes the text; else keep the last line.
+    result.result_text = tail
+    for alias in ("result", "text", "content", "message"):
+        if data.get(alias):
+            result.result_text = str(data[alias])
+            break
+    return result
diff --git a/src/lightcone/eval/models.py b/src/lightcone/eval/models.py
index d7f9ad7f..6baa8d14 100644
--- a/src/lightcone/eval/models.py
+++ b/src/lightcone/eval/models.py
@@ -6,7 +6,18 @@
 from enum import StrEnum
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
+
+
+class HarnessSpec(BaseModel):
+    """One agent harness in the run matrix, with an optional model override.
+
+    In run-config YAML an entry is a mapping (``{name: codex, model: <id>}``)
+    or a bare string (``- claude``) that ``EvalRunConfig`` coerces to a mapping.
+    """
+
+    name: str  # harness id, e.g. ``claude``
+    model: str | None = None  # ``None``: the harnesses pass no model flag
 
 
 class GraderType(StrEnum):
@@ -66,10 +77,12 @@ class GraderResult(BaseModel):
 
 
 class TrialResult(BaseModel):
-    """Result of a single trial (one task x repetition)."""
+    """Result of a single trial (one task x harness x skill-variant x rep)."""
 
     trial_id: str
     task_id: str
+    harness: str = "claude"
+    with_skills: bool = True
     trial_number: int = 0
     started_at: datetime | None = None
     finished_at: datetime | None = None
@@ -84,14 +97,37 @@ class TrialResult(BaseModel):
 
 
 class EvalRunConfig(BaseModel):
-    """Configuration for an eval run (loaded from run config YAML)."""
+    """Configuration for an eval run (loaded from run config YAML).
+
+    The run is a matrix: ``tasks`` x ``harnesses`` x ``skill_variants`` x
+    ``num_trials``. ``skill_variants`` is the with/without-Lightcone A/B —
+    ``[true]`` for a single arm, ``[true, false]`` to measure the lift.
+    """
 
     id: str = ""
     tasks: list[str] = Field(default_factory=list)
+    backend: str = "daytona"  # "daytona" | "local_docker"
+    harnesses: list[HarnessSpec] = Field(
+        default_factory=lambda: [HarnessSpec(name="claude")]
+    )
+    skill_variants: list[bool] = Field(default_factory=lambda: [True])
     num_trials: int = 3
     max_concurrency: int = 4
     sandbox_image: str | None = None
     output_dir: str = "eval-results"
+    # Optional run-level overrides of the task's budget — e.g. cap turns so a
+    # cheap plumbing smoke exits fast. Unset (or 0) defers to the task's value.
+    max_turns: int | None = None
+    trial_timeout: int | None = None
+
+    @field_validator("harnesses", mode="before")
+    @classmethod
+    def _coerce_harnesses(cls, v: object) -> object:
+        """Pre-validation hook: rewrite each bare-string entry (``- claude``) as a
+        ``{"name": ...}`` mapping; mappings and non-list input pass through."""
+        if not isinstance(v, list):
+            return v
+        return [entry if not isinstance(entry, str) else {"name": entry} for entry in v]
 
 
 class VersionInfo(BaseModel):
diff --git a/src/lightcone/eval/report.py b/src/lightcone/eval/report.py
index 42a677d7..d2bb400d 100644
--- a/src/lightcone/eval/report.py
+++ b/src/lightcone/eval/report.py
@@ -189,6 +189,99 @@ def print_comparison_between(
     console.print(table)
 
 
+def compute_matrix(eval_run: EvalRun) -> dict[tuple[str, str, bool], dict[str, Any]]:
+    """Bucket trials by ``(task, harness, with_skills)`` and summarize each bucket.
+
+    Means cover error-free trials only (``0.0`` when a bucket has none), while
+    ``pass_at_k`` divides build-complete trials by every trial in the bucket.
+    """
+    buckets: dict[tuple[str, str, bool], list[TrialResult]] = defaultdict(list)
+    for trial in eval_run.trials:
+        buckets[(trial.task_id, trial.harness, trial.with_skills)].append(trial)
+
+    def _mean(values: list[float], digits: int) -> float:
+        """Rounded mean of ``values``, or ``0.0`` when there are none."""
+        return round(sum(values) / len(values), digits) if values else 0.0
+
+    matrix: dict[tuple[str, str, bool], dict[str, Any]] = {}
+    for key, members in buckets.items():
+        clean = [m for m in members if m.error is None]
+        # Errored trials stay in the denominator, so they count against pass@k.
+        built = sum(1 for m in clean if m.build_complete)
+        matrix[key] = {
+            "task": key[0],
+            "harness": key[1],
+            "with_skills": key[2],
+            "num_trials": len(members),
+            "num_errors": len(members) - len(clean),
+            "mean_score": _mean([m.composite_score for m in clean], 4),
+            "pass_at_k": built / len(members),
+            "mean_cost_usd": _mean([m.total_cost_usd for m in clean], 4),
+            "mean_duration_seconds": _mean([m.total_duration_seconds for m in clean], 1),
+        }
+    return matrix
+
+
+def _matrix_cell(g: dict[str, Any] | None) -> str:
+    """One cell as Rich markup: score and pass@k, plus a second line carrying the
+    error count and/or mean cost when non-zero. ``None`` renders as an em dash."""
+    if g is None:
+        return "—"
+    lines = [f"{g['mean_score']:.2f}  pass@k {g['pass_at_k']:.0%}"]
+    errors = f"[red]{g['num_errors']} err[/red]" if g["num_errors"] else ""
+    cost = f"${g['mean_cost_usd']:.2f}" if g["mean_cost_usd"] else ""
+    if errors or cost:
+        lines.append(" ".join(filter(None, (errors, cost))))
+    return "\n".join(lines)
+
+
+def print_matrix_table(eval_run: EvalRun, console: Console | None = None) -> None:
+    """Render the task × harness × {with,without}-Lightcone matrix as a table.
+
+    The Δ column is the with−without **lift** — the A/B headline: how far the
+    Lightcone layer moved each harness's mean score. The column appears only if
+    some (task, harness) pair ran both arms; a pair missing an arm shows "—".
+    """
+    out = Console() if console is None else console
+
+    matrix = compute_matrix(eval_run)
+    if not matrix:
+        out.print("[yellow]No results to display.[/yellow]")
+        return
+
+    # Keys are (task, harness, with_skills); sorting the (task, harness) pairs
+    # gives a task-major, harness-minor row order.
+    pairs = sorted({(task, harness) for task, harness, _ in matrix})
+    show_lift = any(
+        (*pair, True) in matrix and (*pair, False) in matrix for pair in pairs
+    )
+
+    # Two label columns, one column per arm, and the lift column when it applies.
+    table = Table(title="Eval Matrix: score by harness × Lightcone layer", show_lines=True)
+    table.add_column("Task", style="bold")
+    table.add_column("Harness", style="bold")
+    for arm_header in ("with skills", "without skills"):
+        table.add_column(arm_header, justify="center")
+    if show_lift:
+        table.add_column("Δ lift", justify="center")
+
+    for task, harness in pairs:
+        with_arm = matrix.get((task, harness, True))
+        without_arm = matrix.get((task, harness, False))
+        cells = [task, harness, _matrix_cell(with_arm), _matrix_cell(without_arm)]
+        # A lift needs both arms; a pair that ran only one gets a placeholder.
+        if show_lift and with_arm and without_arm:
+            lift = with_arm["mean_score"] - without_arm["mean_score"]
+            tint = "green" if lift > 0 else ("red" if lift < 0 else "white")
+            cells.append(f"[{tint}]{lift:+.2f}[/{tint}]")
+        elif show_lift:
+            cells.append("—")
+        table.add_row(*cells)
+
+    out.print(table)
+
+
 def save_results(eval_run: EvalRun, output_dir: str | Path) -> Path:
     """Save full EvalRun to JSON inside the run's sidecar directory."""
     output_dir = Path(output_dir)
diff --git a/src/lightcone/eval/sandbox.py b/src/lightcone/eval/sandbox.py
index 6be41097..ac129edc 100644
--- a/src/lightcone/eval/sandbox.py
+++ b/src/lightcone/eval/sandbox.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import json
 import logging
 import os
 import shlex
@@ -11,10 +10,23 @@
 from pathlib import Path
 from typing import Any
 
+from lightcone.eval.harnesses.base import AgentResult
+from lightcone.eval.harnesses.claude import parse_claude_output
+
 logger = logging.getLogger(__name__)
 
 BUILD_COMPLETE_MARKER = "BUILD_COMPLETE"
 
+# Idle-minutes backstop before Daytona auto-stops a sandbox. Teardown is the
+# primary cleanup path (``teardown()`` deletes the sandbox), but any path where
+# teardown does NOT run — an unhandled exception, the eval process being killed,
+# a cancelled CI job — would otherwise leak a sandbox that runs forever and
+# burns the compute budget (LCR-131). A non-zero auto-stop is the safety net:
+# an active trial polls Daytona every ~10s so it never goes idle, while an
+# orphaned sandbox stops after this many idle minutes. NOT zero — zero disables
+# the net entirely, which is the bug.
+AUTO_STOP_BACKSTOP_MINUTES = 30
+
 
 @dataclass
 class ExecuteResult:
@@ -24,16 +36,11 @@ class ExecuteResult:
     output: str
 
 
-@dataclass
-class ClaudeResult:
-    """Parsed result from a claude -p invocation."""
-
-    cost_usd: float = 0.0
-    num_turns: int = 0
-    duration_ms: int = 0
-    result_text: str = ""
-    is_error: bool = False
-    raw_jsonl: str = ""
+# Back-compat alias: the harness layer now owns the normalized agent-result
+# type. `_parse_claude_output` is re-exported from the Claude harness so callers
+# (and tests) that imported it from here keep working.
+ClaudeResult = AgentResult
+_parse_claude_output = parse_claude_output
 
 
 @dataclass
@@ -140,7 +147,9 @@ def create(self) -> None:
             image=image,
             labels=labels,
             env_vars=sandbox_env,
-            auto_stop_interval=0,  # disable auto-stop; sandbox is deleted in teardown
+            # Backstop against leaked sandboxes (see AUTO_STOP_BACKSTOP_MINUTES);
+            # teardown() still deletes the sandbox on the normal path.
+            auto_stop_interval=AUTO_STOP_BACKSTOP_MINUTES,
         )
 
         def _on_build_log(line: str) -> None:
@@ -243,31 +252,26 @@ def exec_claude(
         max_turns: int = 25,
         timeout: int = 600,
         model: str | None = None,
-    ) -> ClaudeResult:
-        """Run claude -p with the loop prompt and parse JSON output.
+    ) -> AgentResult:
+        """Run Claude Code headless against the staged loop prompt.
 
-        Uses Daytona's session API with async execution + polling to avoid
-        HTTP connection timeouts on long-running Claude Code invocations.
+        Thin back-compat wrapper over :class:`ClaudeHarness`. The multi-harness
+        trial loop calls ``harness.invoke(sandbox, ...)`` directly; this stays so
+        existing callers and tests keep working.
         """
-        assert self._sandbox is not None, "Call create() first"
-
-        model_flag = f"--model {shlex.quote(model)}" if model else ""
-        cmd = (
-            f"cd {self.WORK_DIR} && "
-            f"claude -p \"$(cat /tmp/loop-prompt.md)\" "
-            f"--output-format stream-json --verbose "
-            f"--dangerously-skip-permissions "
-            f"--max-turns {max_turns} "
-            f"{model_flag}"
-        ).strip()
-
-        start = time.monotonic()
-        result = self._exec_async_poll(cmd, timeout=timeout)
-        duration_ms = int((time.monotonic() - start) * 1000)
-
-        return _parse_claude_output(result.output, result.exit_code, duration_ms)
+        from lightcone.eval.harnesses.claude import ClaudeHarness
+
+        return ClaudeHarness().invoke(
+            self,
+            prompt_path="/tmp/loop-prompt.md",
+            work_dir=self.WORK_DIR,
+            max_turns=max_turns,
+            timeout=timeout,
+            with_skills=True,
+            model=model,
+        )
 
-    def _exec_async_poll(
+    def exec_async_poll(
         self, cmd: str, timeout: int = 600, poll_interval: int = 10
     ) -> ExecuteResult:
         """Execute a command asynchronously via Daytona sessions and poll for completion.
@@ -365,42 +369,3 @@ def _upload_directory(self, local_dir: Path, remote_dir: str) -> None:
                 rel = local_path.relative_to(local_dir)
                 remote_path = f"{remote_dir}/{rel}"
                 self.upload_file(remote_path, local_path.read_bytes())
-
-
-def _parse_claude_output(
-    raw_output: str, exit_code: int, duration_ms: int
-) -> ClaudeResult:
-    """Parse JSONL output from claude -p --output-format stream-json.
-
-    The stream-json format emits one JSON object per line. The final line
-    with ``{"type": "result", ...}`` contains the aggregate metrics.
-    """
-    result = ClaudeResult(duration_ms=duration_ms, raw_jsonl=raw_output)
-
-    if exit_code != 0:
-        result.is_error = True
-        result.result_text = raw_output
-        return result
-
-    for raw_line in reversed(raw_output.strip().splitlines()):
-        stripped = raw_line.strip()
-        if not stripped or not stripped.startswith("{"):
-            continue
-        try:
-            data = json.loads(stripped)
-            if data.get("type") == "result":
-                result.cost_usd = float(
-                    data.get("cost_usd", data.get("total_cost_usd", 0.0))
-                )
-                result.num_turns = int(data.get("num_turns", 0))
-                result.duration_ms = int(data.get("duration_ms", duration_ms))
-                result.result_text = str(data.get("result", ""))
-                result.is_error = bool(data.get("is_error", False))
-                return result
-        except (json.JSONDecodeError, ValueError):
-            continue
-
-    # No result line found
-    result.result_text = raw_output
-    result.is_error = True
-    return result
diff --git a/tests/test_eval_harness.py b/tests/test_eval_harness.py
index 2e37b066..dc6f389b 100644
--- a/tests/test_eval_harness.py
+++ b/tests/test_eval_harness.py
@@ -17,6 +17,7 @@
 )
 from lightcone.eval.models import (
     EvalRunConfig,
+    HarnessSpec,
     TaskSpec,
 )
 from lightcone.eval.sandbox import (
@@ -99,12 +100,15 @@ def test_custom_prompt(self, evals_dir: Path):
 
 class TestRunTrial:
     @patch("lightcone.eval.harness.EvalSandbox")
-    def test_successful_trial(self, mock_sandbox_cls: MagicMock, evals_dir: Path):
+    @patch("lightcone.eval.harness.get_harness")
+    def test_successful_trial(
+        self, mock_get_harness: MagicMock, mock_sandbox_cls: MagicMock, evals_dir: Path
+    ):
         """Test a trial that completes successfully."""
         sandbox_instance = mock_sandbox_cls.return_value
         sandbox_instance.WORK_DIR = "/home/user/project"
 
-        sandbox_instance.exec_claude.return_value = ClaudeResult(
+        mock_get_harness.return_value.invoke.return_value = ClaudeResult(
             cost_usd=0.05,
             num_turns=10,
             duration_ms=5000,
@@ -122,7 +126,8 @@ def test_successful_trial(self, mock_sandbox_cls: MagicMock, evals_dir: Path):
         config = EvalRunConfig(id="test-run")
 
         trial = run_trial(
-            task, 0, evals_dir=evals_dir, config=config, run_id="r1", wheels=[],
+            task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True,
+            evals_dir=evals_dir, config=config, run_id="r1", wheels=[],
         )
 
         assert trial.build_complete is True
@@ -141,7 +146,8 @@ def test_trial_with_error(self, mock_sandbox_cls: MagicMock, evals_dir: Path):
         config = EvalRunConfig(id="test-run")
 
         trial = run_trial(
-            task, 0, evals_dir=evals_dir, config=config, run_id="r1", wheels=[],
+            task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True,
+            evals_dir=evals_dir, config=config, run_id="r1", wheels=[],
         )
 
         assert trial.error is not None
@@ -149,12 +155,15 @@ def test_trial_with_error(self, mock_sandbox_cls: MagicMock, evals_dir: Path):
         sandbox_instance.teardown.assert_called_once()
 
     @patch("lightcone.eval.harness.EvalSandbox")
-    def test_trial_incomplete(self, mock_sandbox_cls: MagicMock, evals_dir: Path):
+    @patch("lightcone.eval.harness.get_harness")
+    def test_trial_incomplete(
+        self, mock_get_harness: MagicMock, mock_sandbox_cls: MagicMock, evals_dir: Path
+    ):
         """Test a trial where the build does not complete."""
         sandbox_instance = mock_sandbox_cls.return_value
         sandbox_instance.WORK_DIR = "/home/user/project"
 
-        sandbox_instance.exec_claude.return_value = ClaudeResult(
+        mock_get_harness.return_value.invoke.return_value = ClaudeResult(
             cost_usd=0.02,
             num_turns=5,
             duration_ms=3000,
@@ -171,7 +180,8 @@ def test_trial_incomplete(self, mock_sandbox_cls: MagicMock, evals_dir: Path):
         config = EvalRunConfig(id="test-run")
 
         trial = run_trial(
-            task, 0, evals_dir=evals_dir, config=config, run_id="r1", wheels=[],
+            task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True,
+            evals_dir=evals_dir, config=config, run_id="r1", wheels=[],
         )
 
         assert trial.build_complete is False
@@ -181,13 +191,20 @@ def test_trial_incomplete(self, mock_sandbox_cls: MagicMock, evals_dir: Path):
 
 class TestSidecarFiles:
     @patch("lightcone.eval.harness.EvalSandbox")
-    def test_sidecar_written(self, mock_sandbox_cls: MagicMock, evals_dir: Path, tmp_path: Path):
+    @patch("lightcone.eval.harness.get_harness")
+    def test_sidecar_written(
+        self,
+        mock_get_harness: MagicMock,
+        mock_sandbox_cls: MagicMock,
+        evals_dir: Path,
+        tmp_path: Path,
+    ):
         """Test that JSONL sidecar files are written when sidecar_dir is provided."""
         sandbox_instance = mock_sandbox_cls.return_value
         sandbox_instance.WORK_DIR = "/home/user/project"
 
         raw_jsonl = '{"type":"assistant","message":"hello"}\n{"type":"result","cost_usd":0.05}\n'
-        sandbox_instance.exec_claude.return_value = ClaudeResult(
+        mock_get_harness.return_value.invoke.return_value = ClaudeResult(
             cost_usd=0.05, num_turns=3, duration_ms=1000,
             result_text=BUILD_COMPLETE_MARKER, is_error=False,
             raw_jsonl=raw_jsonl,
@@ -202,7 +219,8 @@ def test_sidecar_written(self, mock_sandbox_cls: MagicMock, evals_dir: Path, tmp
 
         sidecar_dir = tmp_path / "logs"
         trial = run_trial(
-            task, 0, evals_dir=evals_dir, config=config,
+            task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True,
+            evals_dir=evals_dir, config=config,
             run_id="r1", wheels=[], sidecar_dir=sidecar_dir,
         )
 
@@ -212,12 +230,15 @@ def test_sidecar_written(self, mock_sandbox_cls: MagicMock, evals_dir: Path, tmp
         assert full_path.read_text() == raw_jsonl
 
     @patch("lightcone.eval.harness.EvalSandbox")
-    def test_no_sidecar_without_dir(self, mock_sandbox_cls: MagicMock, evals_dir: Path):
+    @patch("lightcone.eval.harness.get_harness")
+    def test_no_sidecar_without_dir(
+        self, mock_get_harness: MagicMock, mock_sandbox_cls: MagicMock, evals_dir: Path
+    ):
         """transcript_path stays None when no sidecar_dir is given."""
         sandbox_instance = mock_sandbox_cls.return_value
         sandbox_instance.WORK_DIR = "/home/user/project"
 
-        sandbox_instance.exec_claude.return_value = ClaudeResult(
+        mock_get_harness.return_value.invoke.return_value = ClaudeResult(
             cost_usd=0.01, num_turns=1, duration_ms=100,
             result_text=BUILD_COMPLETE_MARKER, is_error=False,
             raw_jsonl='{"type":"result"}\n',
@@ -231,7 +252,8 @@ def test_no_sidecar_without_dir(self, mock_sandbox_cls: MagicMock, evals_dir: Pa
         config = EvalRunConfig(id="test-run")
 
         trial = run_trial(
-            task, 0, evals_dir=evals_dir, config=config, run_id="r1", wheels=[],
+            task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True,
+            evals_dir=evals_dir, config=config, run_id="r1", wheels=[],
         )
         assert trial.iterations[0].transcript_path is None