diff --git a/evals/local-demo.yaml b/evals/local-demo.yaml new file mode 100644 index 00000000..3e6d6b4c --- /dev/null +++ b/evals/local-demo.yaml @@ -0,0 +1,24 @@ +# Full local demo: the with/without-Lightcone A/B across three harnesses on the +# snae task, run in local Docker. The matrix is {claude, codex, pi} x +# {with-skills, without-skills}. The report's "Δ lift" column is the headline — +# how much the Lightcone layer moved each harness's score. +# +# Fair, comparable model tier across harnesses (codex Spark was retired): codex +# and pi both run gpt-5.4-mini, claude runs haiku. pi reaches gpt-5.4-mini via +# Cail's GitHub Copilot subscription using pi's native `provider/model` string. +# claude needs CLAUDE_CODE_OAUTH_TOKEN in the host env (loaded from .env). +id: local-demo +backend: local_docker +tasks: + - snae +harnesses: + - { name: claude, model: haiku } + - { name: codex, model: gpt-5.4-mini } + - { name: pi, model: github-copilot/gpt-5.4-mini } +skill_variants: [true, false] +num_trials: 1 +max_concurrency: 3 +# No turn cap — agents run to completion (claude uses the task's max_turns=200). +# trial_timeout is just a safety ceiling against a hung agent, not a turn cap. +trial_timeout: 1800 +output_dir: eval-results diff --git a/evals/local-smoke.yaml b/evals/local-smoke.yaml new file mode 100644 index 00000000..5de8266c --- /dev/null +++ b/evals/local-smoke.yaml @@ -0,0 +1,22 @@ +# Cheap plumbing smoke for the local-Docker multi-harness path. `max_turns` +# caps each trial so it exits fast: the build won't complete (graders score +# low), but the whole path — container build, auth copy-in, headless invoke, +# output parse, grading, teardown, scorecard — is exercised end to end across +# all three harnesses. Run this first to shake out orchestration/auth bugs +# before spending on a full build. 
+id: local-smoke +backend: local_docker +tasks: + - snae +harnesses: + - claude + - codex + - pi +skill_variants: [true] +num_trials: 1 +max_concurrency: 1 +# max_turns only bounds claude; codex and pi have no max-turns flag, so a SHORT +# trial_timeout is what keeps a smoke cheap for them. +max_turns: 5 +trial_timeout: 180 +output_dir: eval-results diff --git a/evals/tasks/snae/astra.yaml b/evals/tasks/snae/astra.yaml index 464e8122..c1b5f121 100644 --- a/evals/tasks/snae/astra.yaml +++ b/evals/tasks/snae/astra.yaml @@ -1,13 +1,31 @@ # ASTRA Analysis Specification # Documentation: https://github.com/LightconeResearch/ASTRA +id: snae version: "1.0" name: "snae" -description: | - Fit the Union2.1 Type Ia supernova distance modulus vs redshift data - to a flat LCDM cosmological model with two free parameters (H0, Omega_L) - using maximum-likelihood (MAP) point estimation. This provides best-fit - cosmological parameters as a building block for a larger analysis. + +narrative: + summary: | + Fit the Union2.1 Type Ia supernova distance modulus vs redshift data + to a flat LCDM cosmological model with two free parameters (H0, Omega_L) + using maximum-likelihood (MAP) point estimation. This provides best-fit + cosmological parameters as a building block for a larger analysis. + inputs: | + The single input is the [Union2.1 compilation](#inputs.union21): 580 + Type Ia supernovae with redshift, distance modulus, and uncertainties. + methods: | + The fit minimizes a chi-squared between the observed distance moduli and + the flat-LCDM prediction, varying H0 and Omega_L. Three decisions shape + the fit: the [optimizer](#decisions.optimizer) used for the minimization, + the [error model](#decisions.error_model) (statistical-only vs. + statistical+systematic uncertainties), and a [low-redshift cut](#decisions.redshift_cut) + that optionally removes peculiar-velocity-dominated supernovae. 
+ outputs: | + Three outputs: the [best-fit parameters](#outputs.best_fit) (H0, Omega_L, + reduced chi-squared), a [Hubble diagram](#outputs.hubble_diagram) with the + best-fit model overlaid on the data, and a [residuals plot](#outputs.residuals) + of data minus model versus redshift. container: Containerfile @@ -21,22 +39,38 @@ outputs: - id: best_fit type: metric description: "Best-fit H0 and Omega_L from chi-squared minimization, with reduced chi-squared" + inputs: [union21] + decisions: [optimizer, error_model, redshift_cut] recipe: - command: python scripts/fit.py + command: >- + python scripts/fit.py + --union21 {inputs.union21} + --optimizer {decisions.optimizer} + --error-model {decisions.error_model} + --redshift-cut {decisions.redshift_cut} + --out {output} - id: hubble_diagram type: figure description: "Hubble diagram: distance modulus vs redshift with best-fit model overlay" + inputs: [union21, best_fit] recipe: - command: python scripts/plot_hubble.py - inputs: [best_fit] + command: >- + python scripts/plot_hubble.py + --union21 {inputs.union21} + --best-fit {inputs.best_fit} + --out {output} - id: residuals type: figure description: "Residuals plot: data minus best-fit model vs redshift" + inputs: [union21, best_fit] recipe: - command: python scripts/plot_residuals.py - inputs: [best_fit] + command: >- + python scripts/plot_residuals.py + --union21 {inputs.union21} + --best-fit {inputs.best_fit} + --out {output} decisions: optimizer: diff --git a/src/lightcone/eval/backends/__init__.py b/src/lightcone/eval/backends/__init__.py new file mode 100644 index 00000000..eb9b64b2 --- /dev/null +++ b/src/lightcone/eval/backends/__init__.py @@ -0,0 +1,21 @@ +"""Sandbox backends for the eval harness. + +A backend is the execution substrate one trial runs inside. All backends share +the :class:`Sandbox` surface, so a harness drives any of them unchanged. + + - :class:`LocalDockerSandbox` — a local Docker container per trial. 
The + counterpart of the Daytona :class:`lightcone.eval.sandbox.EvalSandbox`, for + running the suite on a developer/CI host with Docker rather than a Daytona + account. +""" + +from __future__ import annotations + +from lightcone.eval.backends.base import ExecuteResult, Sandbox +from lightcone.eval.backends.local_docker import LocalDockerSandbox + +__all__ = [ + "ExecuteResult", + "LocalDockerSandbox", + "Sandbox", +] diff --git a/src/lightcone/eval/backends/base.py b/src/lightcone/eval/backends/base.py new file mode 100644 index 00000000..f1f18b87 --- /dev/null +++ b/src/lightcone/eval/backends/base.py @@ -0,0 +1,72 @@ +"""Sandbox backend abstraction for eval trials. + +A ``Sandbox`` is the execution substrate one eval trial runs inside. It mirrors +the public surface of the original :class:`lightcone.eval.sandbox.EvalSandbox` +(the Daytona backend) so harnesses — which depend only on the +``SandboxLike`` protocol (``WORK_DIR``, ``exec``, ``exec_async_poll``, +``upload_file``) — drive any backend unchanged. + +Backends: + - ``EvalSandbox`` (sandbox.py) — ephemeral Daytona cloud sandbox + - ``LocalDockerSandbox`` (this pkg) — a local Docker container per trial +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from pathlib import Path + + +@dataclass +class ExecuteResult: + """Result from running a command in a sandbox.""" + + exit_code: int + output: str + + +class Sandbox(ABC): + """One ephemeral execution substrate for a single eval trial. + + The lifecycle is ``create() → setup() → (exec/exec_async_poll/upload_file)* → + teardown()``. Subclasses provide the concrete substrate (Daytona cloud + sandbox, local Docker container, …); the abstract surface here is exactly + what the trial loop and the harness layer consume. + """ + + #: Project root inside the sandbox — where ``lc init`` scaffolds and the + #: agent runs. Mirrors ``EvalSandbox.WORK_DIR``. 
+ WORK_DIR = "/home/evaluser/project" + + @abstractmethod + def create(self) -> None: + """Provision the substrate (build image if needed, start the sandbox).""" + + @abstractmethod + def setup( + self, + seed_dir: Path, + universe: str, + loop_prompt_template: str, + wheels: list[Path] | None = None, + ) -> None: + """Scaffold the project via ``lc init`` and overlay task seed files.""" + + @abstractmethod + def exec(self, cmd: str, timeout: int = 300, cwd: str | None = None) -> ExecuteResult: + """Run a command in the sandbox, returning its exit code and output.""" + + @abstractmethod + def exec_async_poll( + self, cmd: str, timeout: int = 600, poll_interval: int = 10 + ) -> ExecuteResult: + """Run a long-running command, tolerant of gateway timeouts.""" + + @abstractmethod + def upload_file(self, remote_path: str, content: bytes) -> None: + """Upload a file into the sandbox at ``remote_path``.""" + + @abstractmethod + def teardown(self) -> None: + """Destroy the substrate. Idempotent.""" diff --git a/src/lightcone/eval/backends/local_docker.py b/src/lightcone/eval/backends/local_docker.py new file mode 100644 index 00000000..4feffb0b --- /dev/null +++ b/src/lightcone/eval/backends/local_docker.py @@ -0,0 +1,378 @@ +"""Local-Docker sandbox backend for the eval harness. + +Runs each trial in a local Docker container — the on-host/CI counterpart of the +Daytona :class:`lightcone.eval.sandbox.EvalSandbox`. Drives the ``docker`` CLI +via ``subprocess`` (no docker SDK), and mirrors ``EvalSandbox``'s public surface +and ``setup()`` body so harnesses run unchanged. + +The image (``lc-eval-local:latest``) bakes in every registered harness's agent +CLI plus the third-party Python deps the Daytona image pre-installs; it is built +once and cached by tag. 
Auth credentials are copied in per-container (``docker +cp`` + ``chown``) rather than bind-mounted, because the host credential files are +``0600`` and host-uid-owned and would not be readable by the container's +``evaluser``. +""" + +from __future__ import annotations + +import logging +import os +import re +import shlex +import subprocess +import tempfile +from pathlib import Path + +from lightcone.eval.backends.base import ExecuteResult, Sandbox +from lightcone.eval.harnesses import available_harnesses, get_harness +from lightcone.eval.harnesses.base import Harness + +logger = logging.getLogger(__name__) + +#: Shared image tag — built once, reused across trials. +IMAGE_TAG = "lc-eval-local:latest" + +#: Third-party Python deps the Daytona image pre-installs system-wide. Kept +#: verbatim in sync with ``EvalSandbox.create``'s ``deps`` so the local image +#: resolves ``lc`` / ``astra`` identically (the lightcone-cli wheel is installed +#: ``--no-deps`` at setup time, so every runtime dep it needs is listed here). +DEPS = ( + "astra-tools astra-spec" + " jinja2 jsonschema" + " snakemake snakemake-interface-executor-plugins" + " snakemake-interface-common dask distributed" +) + +#: Per-harness host auth files to copy into the container, keyed by harness name. +#: Each entry is ``(host_path, container_path)``; host paths may use ``~``. +#: Missing host files are skipped (a harness without credentials is *skipped*, +#: not failed — mirrors ``credential_env_keys`` semantics). +CREDENTIAL_FILES: dict[str, list[tuple[str, str]]] = { + # claude authenticates purely via CLAUDE_CODE_OAUTH_TOKEN (forwarded env) on + # top of the image's onboarding file. We deliberately do NOT copy the host + # ~/.claude.json: on macOS it lacks the token anyway (keychain holds it), and + # it drags Cail's MCP/project state into every trial container. 
+ "claude": [], + "codex": [("~/.codex/auth.json", "/home/evaluser/.codex/auth.json")], + "pi": [ + ("~/.pi/agent/auth.json", "/home/evaluser/.pi/agent/auth.json"), + ("~/.pi/agent/models.json", "/home/evaluser/.pi/agent/models.json"), + ("~/.pi/agent/settings.json", "/home/evaluser/.pi/agent/settings.json"), + ], +} + + +def _sanitize(name: str) -> str: + """Coerce ``name`` to the docker container-name charset ``[a-zA-Z0-9_.-]``.""" + return re.sub(r"[^a-zA-Z0-9_.-]", "-", name) + + +def _install_lines() -> list[str]: + """Dockerfile ``RUN`` lines for every registered harness's install commands + (one per command; two for Claude's rewritten installer), adapted so the + agent CLI lands on ``PATH`` for ``evaluser``. + + Two install shapes exist among the harnesses: + + - **npm globals** (codex, pi): run as root (the default ``RUN`` user) so the + binary lands in ``/usr/local/bin`` — on ``PATH`` for every user. + - **Claude's curl installer**, whose command ends ``&& cp /root/.local/bin/ + claude /usr/local/bin/claude`` (assuming a root install): the ``cp`` source + doesn't exist when run as ``evaluser``. We rewrite it to install as + ``evaluser`` (so its config lands under ``/home/evaluser``) and symlink from + ``/home/evaluser/.local/bin`` into ``/usr/local/bin`` as root. + """ + lines: list[str] = [] + for hname in available_harnesses(): + for cmd in get_harness(hname).install_commands(): + # Claude idiom: `<installer> && cp /root/.local/bin/<bin> /usr/local/bin/<bin>`. + # The regex captures <bin>; everything before the match is <installer>. + m = re.search(r"cp\s+/root/\.local/bin/(\S+)\s+/usr/local/bin/\S+", cmd) + if m: + binary = m.group(1) + installer = cmd[: m.start()].rstrip(" &") + lines.append(f"RUN su - evaluser -c {shlex.quote(installer)}") + lines.append( + f"RUN ln -sf /home/evaluser/.local/bin/{binary} /usr/local/bin/{binary}" + ) + else: + # npm -g and friends: root install → /usr/local/bin. 
+ lines.append(f"RUN {cmd}") + return lines + + +def _build_dockerfile() -> str: + """Generate the Dockerfile for the shared eval image (all harnesses baked in).""" + install = "\n".join(_install_lines()) + # Node 22 (NodeSource) — pi (`@earendil-works/pi-coding-agent`) needs + # node >=22.19.0; Debian slim's `nodejs` apt package is v20 and crashes pi at + # runtime (`webidl.util.markAsUncloneable is not a function`). codex and + # claude are unaffected, but we install one modern Node for all npm globals. + return f"""FROM python:3.12-slim +RUN apt-get update && apt-get install -y git curl bash sudo jq \\ + && curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \\ + && apt-get install -y nodejs \\ + && rm -rf /var/lib/apt/lists/* +RUN useradd -m -s /bin/bash evaluser \\ + && echo 'evaluser ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +RUN pip install --no-cache-dir {DEPS} +{install} +RUN mkdir -p /home/evaluser/.claude \\ + && echo '{{"hasCompletedOnboarding": true}}' > /home/evaluser/.claude.json \\ + && chown -R evaluser:evaluser /home/evaluser/.claude /home/evaluser/.claude.json +""" + + +class LocalDockerSandbox(Sandbox): + """Runs one eval trial inside a local Docker container. + + The container is started detached (``sleep infinity``) and each ``exec`` is a + fresh ``docker exec``; ``teardown`` removes it. The image is shared across + trials and built once (cached by tag). 
+    """
+
+    def __init__(
+        self,
+        task_id: str,
+        trial_id: str,
+        harness: Harness,
+        env_vars: dict[str, str],
+    ) -> None:
+        """Record the trial identity; nothing is started until ``create()``.
+
+        ``env_vars`` are handed to the container as ``-e KEY=VALUE``; the
+        container name is derived from ``trial_id``.
+        """
+        self.task_id = task_id
+        self.trial_id = trial_id
+        self.harness = harness
+        self.env_vars = env_vars
+        self.cname = _sanitize(f"lc-eval-{trial_id}")
+
+    # ------------------------------------------------------------------ lifecycle
+
+    def create(self) -> None:
+        """Build the shared image (if not cached) and start the trial container.
+
+        Raises:
+            RuntimeError: if the image build or a ``docker`` command fails.
+        """
+        self._ensure_image()
+
+        env_args: list[str] = []
+        for key, val in self.env_vars.items():
+            env_args += ["-e", f"{key}={val}"]
+        # Forward host credentials this harness declares, when set in the env.
+        # Pass the bare name (``-e KEY``): docker takes the value from the CLI
+        # process's own environment, which the subprocess inherits from us, so
+        # the secret never appears in argv (``ps``) or in the command line that
+        # ``_run`` echoes when the call fails.
+        for key in self.harness.credential_env_keys():
+            if os.environ.get(key):
+                env_args += ["-e", key]
+
+        self._run(
+            [
+                "docker", "run", "-d",
+                "--name", self.cname,
+                "-u", "evaluser",
+                *env_args,
+                "-w", "/home/evaluser",
+                IMAGE_TAG,
+                "sleep", "infinity",
+            ]
+        )
+        logger.info("Started container %s for trial %s", self.cname, self.trial_id)
+
+        self._copy_credentials()
+
+    def _ensure_image(self) -> None:
+        """Build ``lc-eval-local:latest`` unless already present (tag cache).
+
+        Raises:
+            RuntimeError: if ``docker build`` exits non-zero.
+        """
+        present = subprocess.run(
+            ["docker", "image", "inspect", IMAGE_TAG],
+            capture_output=True,
+        )
+        if present.returncode == 0:
+            logger.info("Reusing cached eval image %s", IMAGE_TAG)
+            return
+
+        # NOTE(review): trials started concurrently may each see the tag missing
+        # and build in parallel — presumably harmless but wasteful; confirm.
+        dockerfile = _build_dockerfile()
+        logger.info("Building eval image %s …", IMAGE_TAG)
+        # ``-`` reads the Dockerfile from stdin with *no* build context. The
+        # generated Dockerfile only has FROM/RUN lines, so nothing from the
+        # caller's working directory is needed (or should be handed to docker).
+        proc = subprocess.run(
+            ["docker", "build", "-t", IMAGE_TAG, "-"],
+            input=dockerfile,
+            text=True,
+            capture_output=True,
+        )
+        for line in (proc.stdout + proc.stderr).splitlines():
+            logger.info("[image build] %s", line.rstrip())
+        if proc.returncode != 0:
+            raise RuntimeError(f"Failed to build {IMAGE_TAG} (exit {proc.returncode})")
+        logger.info("Built eval image %s", IMAGE_TAG)
+
+    def _copy_credentials(self) -> None:
+        """Copy host auth files into the container (``docker cp`` + ``chown``).
+ + Bind mounts won't work: host credential files are ``0600`` and + host-uid-owned, unreadable by the container's ``evaluser``. So we copy + them in and re-own to ``evaluser``. Missing host files are skipped. + """ + for host_raw, container_path in CREDENTIAL_FILES.get(self.harness.name, []): + host_path = Path(host_raw).expanduser() + if not host_path.is_file(): + continue + parent = os.path.dirname(container_path) + self._run(["docker", "exec", "-u", "evaluser", self.cname, "mkdir", "-p", parent]) + self._run(["docker", "cp", str(host_path), f"{self.cname}:{container_path}"]) + self._run( + ["docker", "exec", "-u", "root", self.cname, + "chown", "-R", "evaluser:evaluser", container_path] + ) + + def teardown(self) -> None: + """Force-remove the container. Idempotent (errors ignored).""" + subprocess.run( + ["docker", "rm", "-f", self.cname], + capture_output=True, + ) + + # ------------------------------------------------------------------ exec / io + + def exec(self, cmd: str, timeout: int = 300, cwd: str | None = None) -> ExecuteResult: + """Run ``cmd`` as ``evaluser`` via ``docker exec ... bash -lc``. + + Returns the exit code with stdout followed by stderr as one string. If the + host-side ``docker exec`` exceeds ``timeout`` seconds, returns exit code + ``124`` and a "Timed out" message instead of the captured output. + """ + full = f"cd {shlex.quote(cwd)} && {cmd}" if cwd else cmd + try: + proc = subprocess.run( + ["docker", "exec", "-u", "evaluser", self.cname, "bash", "-lc", full], + capture_output=True, + text=True, + timeout=timeout, + ) + except subprocess.TimeoutExpired: + # NOTE(review): the timeout stops the host-side ``docker exec`` client; the + # command inside the container presumably keeps running until teardown, so + # later exec calls (e.g. grading) may race with it — confirm. + return ExecuteResult(124, f"Timed out after {timeout}s") + return ExecuteResult(proc.returncode, proc.stdout + proc.stderr) + + def exec_async_poll( + self, cmd: str, timeout: int = 600, poll_interval: int = 10 + ) -> ExecuteResult: + """Run a long command. 
Locally a blocking ``docker exec`` is fine — there + is no gateway timeout to drop the connection — so delegate to ``exec``.""" + return self.exec(cmd, timeout=timeout) + + def upload_file(self, remote_path: str, content: bytes) -> None: + """Write ``content`` to ``remote_path`` inside the container.""" + with tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp.write(content) + tmp_path = tmp.name + try: + parent = os.path.dirname(remote_path) + if parent: + self._run( + ["docker", "exec", "-u", "evaluser", self.cname, "mkdir", "-p", parent] + ) + self._run(["docker", "cp", tmp_path, f"{self.cname}:{remote_path}"]) + self._run( + ["docker", "exec", "-u", "root", self.cname, + "chown", "evaluser:evaluser", remote_path] + ) + finally: + os.unlink(tmp_path) + + # ------------------------------------------------------------------ setup + + def setup( + self, + seed_dir: Path, + universe: str, + loop_prompt_template: str, + wheels: list[Path] | None = None, + ) -> None: + """Scaffold the project via ``lc init`` and overlay task seed files. + + Faithful to :meth:`EvalSandbox.setup`: write the global config, run + ``lc init --no-git --no-venv``, overlay the task seed dir, regenerate the + baseline universe, stage the loop prompt, then ``git init`` + seed commit. + Wheels are installed system-wide (via ``sudo``) so ``lc`` / ``astra`` + resolve before they're invoked. + """ + if wheels: + self._install_wheels(wheels) + + # Pin the global config explicitly for reproducibility (lc would + # auto-create it, but writing it makes the runtime deterministic). + self.exec( + "mkdir -p ~/.lightcone" + " && printf 'container:\\n runtime: auto\\n' > ~/.lightcone/config.yaml" + ) + + # Scaffold from the wheel under test. --no-venv: deps are system-wide. + # --no-git: we git init below, after the overlay, so the seed commit + # captures the task files too. 
+ result = self.exec( + f"mkdir -p {self.WORK_DIR} && lc init {self.WORK_DIR} --no-git --no-venv", + timeout=120, + ) + if result.exit_code != 0: + raise RuntimeError( + f"`lc init` failed (exit {result.exit_code}):\n{result.output[-2000:]}" + ) + + # Overlay task seed files (astra.yaml, data/, task.yaml). + self._upload_directory(seed_dir, self.WORK_DIR) + + # Regenerate baseline.yaml from the task astra.yaml's defaults (the one + # lc init produced matches the boilerplate astra.yaml, now replaced). + self.exec( + f"cd {self.WORK_DIR}" + f" && rm -f universes/baseline.yaml" + f" && astra universe generate -n baseline" + f" -d 'Default configuration using standard practices'", + timeout=60, + ) + + # Template and stage the loop prompt for the agent. + prompt = loop_prompt_template.replace("{{UNIVERSE}}", universe) + self.upload_file("/tmp/loop-prompt.md", prompt.encode()) + + # Initial commit captures the full project state. + self.exec( + f"cd {self.WORK_DIR}" + " && git config --global user.name Eval" + " && git config --global user.email eval@lightcone" + " && git init -q && git add -A && git commit -q -m 'seed'" + ) + + def _install_wheels(self, wheels: list[Path]) -> None: + """Upload the lightcone-cli wheel(s) and install them system-wide. + + ``--no-deps`` (every runtime dep is baked into the image) and + ``--force-reinstall`` (the local wheel always overrides any PyPI version). + ``sudo`` so the install is system-wide and ``lc`` / ``astra`` resolve for + ``evaluser`` — mirroring the Daytona image's root-installed deps. 
+        """
+        self.exec("mkdir -p /tmp/deps")
+
+        remote_paths: list[str] = []
+        for whl in wheels:
+            remote_path = f"/tmp/deps/{whl.name}"
+            self.upload_file(remote_path, whl.read_bytes())
+            remote_paths.append(remote_path)
+
+        whl_cmd = (
+            "sudo pip install --no-deps --force-reinstall "
+            + " ".join(shlex.quote(p) for p in remote_paths)
+        )
+        result = self.exec(whl_cmd, timeout=120)
+        if result.exit_code != 0:
+            # Deliberately best-effort: a failed install is logged, not raised.
+            logger.warning(
+                "Failed to install wheels (exit %d):\n...%s",
+                result.exit_code,
+                result.output[-2000:],
+            )
+        else:
+            logger.info("Installed wheels: %s", [w.name for w in wheels])
+
+    def _upload_directory(self, local_dir: Path, remote_dir: str) -> None:
+        """Upload every regular file under ``local_dir`` into ``remote_dir``.
+
+        The relative layout is preserved; only files are uploaded, so empty
+        directories are not recreated.
+        """
+        for local_path in local_dir.rglob("*"):
+            if local_path.is_file():
+                # as_posix(): container paths are always ``/``-separated, even
+                # when the host path flavour is not (Windows).
+                rel = local_path.relative_to(local_dir).as_posix()
+                self.upload_file(f"{remote_dir}/{rel}", local_path.read_bytes())
+
+    # ------------------------------------------------------------------ helpers
+
+    def _run(self, args: list[str]) -> subprocess.CompletedProcess[str]:
+        """Run a docker CLI command, raising with captured output on failure.
+
+        Raises:
+            RuntimeError: on a non-zero exit. The message echoes the command
+                (with ``-e``/``--env`` values masked) and the last 2000
+                characters of stdout+stderr.
+        """
+        proc = subprocess.run(args, capture_output=True, text=True)
+        if proc.returncode != 0:
+            # Mask ``KEY=VALUE`` following ``-e``/``--env``: env values can be
+            # credentials, and this message ends up in logs and trial results.
+            shown: list[str] = []
+            mask_next = False
+            for arg in args:
+                if mask_next and "=" in arg:
+                    arg = arg.split("=", 1)[0] + "=***"
+                mask_next = arg in ("-e", "--env")
+                shown.append(arg)
+            raise RuntimeError(
+                f"`{' '.join(shown)}` failed (exit {proc.returncode}):\n"
+                f"{(proc.stdout + proc.stderr)[-2000:]}"
+            )
+        return proc
diff --git a/src/lightcone/eval/cli.py b/src/lightcone/eval/cli.py index f1a253a1..c49c097b 100644 --- a/src/lightcone/eval/cli.py +++ b/src/lightcone/eval/cli.py @@ -53,7 +53,12 @@ def run_cmd( lc eval run evals/example-run.yaml --num-trials 1 --concurrency 2 """ from lightcone.eval.harness import load_run_config, run_eval - from lightcone.eval.report import compute_summary, print_comparison_table, save_results + from lightcone.eval.report import ( + compute_summary, + print_comparison_table, + print_matrix_table, + save_results, + ) config = load_run_config(config_path) @@ -99,7 +104,10 @@ def _on_trial_complete(trial: 
object) -> None: if dry_run: schedule = eval_run.summary.get("schedule", []) for s in schedule: - console.print(f" {s['task']} trial {s['trial']}") + skills = "skills" if s.get("skills") else "bare" + console.print( + f" {s['task']} · {s.get('harness', '?')} · {skills} · trial {s['trial']}" + ) console.print(f"\n[bold]Total: {eval_run.summary.get('total_trials', 0)} trials[/bold]") return @@ -117,6 +125,8 @@ def _on_trial_complete(trial: object) -> None: eval_run.summary = compute_summary(eval_run) console.print() print_comparison_table(eval_run, console=console) + console.print() + print_matrix_table(eval_run, console=console) # Save results output_path = save_results(eval_run, config.output_dir) diff --git a/src/lightcone/eval/harness.py b/src/lightcone/eval/harness.py index 138b8220..fae29616 100644 --- a/src/lightcone/eval/harness.py +++ b/src/lightcone/eval/harness.py @@ -17,15 +17,44 @@ from lightcone.eval.build import build_eval_wheels from lightcone.eval.graders import compute_composite_score, run_graders +from lightcone.eval.harnesses import Harness, get_harness from lightcone.eval.models import ( EvalRun, EvalRunConfig, + HarnessSpec, IterationResult, TaskSpec, TrialResult, ) from lightcone.eval.sandbox import BUILD_COMPLETE_MARKER, EvalSandbox + +def _make_sandbox( + config: EvalRunConfig, + task: TaskSpec, + trial_id: str, + harness: Harness, + env_vars: dict[str, str], +) -> Any: + """Construct the trial's sandbox for the configured backend. + + ``local_docker`` runs the agent in a throwaway local container (the default + for local demos); ``daytona`` is the original cloud-sandbox path used in CI. + Both satisfy the ``SandboxLike`` protocol the harness drives. 
+ """ + if config.backend == "local_docker": + from lightcone.eval.backends.local_docker import LocalDockerSandbox + + return LocalDockerSandbox( + task_id=task.id, trial_id=trial_id, harness=harness, env_vars=env_vars + ) + return EvalSandbox( + task_id=task.id, + trial_id=trial_id, + sandbox_image=config.sandbox_image, + env_vars=env_vars, + ) + logger = logging.getLogger(__name__) DEFAULT_LOOP_PROMPT = """\ @@ -83,32 +112,38 @@ def run_trial( task: TaskSpec, trial_number: int, *, + harness_spec: HarnessSpec, + with_skills: bool, evals_dir: Path, config: EvalRunConfig, run_id: str, wheels: list[Path], sidecar_dir: Path | None = None, ) -> TrialResult: - """Run a single trial: create sandbox -> run the build prompt -> grade -> teardown.""" - trial_id = f"{run_id}-{task.id}-{trial_number}" + """Run one trial: create sandbox -> prepare -> run the agent -> grade -> teardown.""" + variant = "skills" if with_skills else "bare" + trial_id = f"{run_id}-{task.id}-{harness_spec.name}-{variant}-{trial_number}" trial = TrialResult( trial_id=trial_id, task_id=task.id, + harness=harness_spec.name, + with_skills=with_skills, trial_number=trial_number, started_at=datetime.now(UTC), ) + harness = get_harness(harness_spec.name) + # The local-Docker backend only forwards what we pass here (it doesn't + # inject eval metadata the way EvalSandbox does), so set it explicitly. env_vars = { + "LIGHTCONE_EVAL": "true", "LIGHTCONE_EVAL_RUN_ID": run_id, + "LIGHTCONE_EVAL_TRIAL_ID": trial_id, + "LIGHTCONE_EVAL_TASK_ID": task.id, "CLAUDE_CODE_SESSION_ID": f"eval-{trial_id}", } - sandbox = EvalSandbox( - task_id=task.id, - trial_id=trial_id, - sandbox_image=config.sandbox_image, - env_vars=env_vars, - ) + sandbox = _make_sandbox(config, task, trial_id, harness, env_vars) try: sandbox.create() @@ -123,13 +158,22 @@ def run_trial( wheels=wheels, ) + # With/without-Lightcone A/B: prepare strips the scaffold for the bare + # arm (or no-ops for harnesses that gate skills via invoke flags). 
+ harness.prepare(sandbox, work_dir=sandbox.WORK_DIR, with_skills=with_skills) + # Single invocation with a high max-turns budget — the prompt is # self-contained and the agent loops over outputs internally. start = time.monotonic() try: - claude_result = sandbox.exec_claude( - max_turns=task.max_turns, - timeout=task.trial_timeout, + claude_result = harness.invoke( + sandbox, + prompt_path="/tmp/loop-prompt.md", + work_dir=sandbox.WORK_DIR, + max_turns=config.max_turns or task.max_turns, + timeout=config.trial_timeout or task.trial_timeout, + with_skills=with_skills, + model=harness_spec.model, ) duration = time.monotonic() - start @@ -198,11 +242,20 @@ def run_eval( # Load all tasks tasks = [load_task(evals_dir, tid) for tid in config.tasks] - # Build trial schedule + # Build trial schedule: tasks x harnesses x skill_variants x num_trials schedule: list[dict[str, Any]] = [] for task in tasks: - for n in range(config.num_trials): - schedule.append({"task": task, "trial_number": n}) + for hspec in config.harnesses: + for with_skills in config.skill_variants: + for n in range(config.num_trials): + schedule.append( + { + "task": task, + "harness_spec": hspec, + "with_skills": with_skills, + "trial_number": n, + } + ) if dry_run: return EvalRun( @@ -210,7 +263,12 @@ def run_eval( started_at=datetime.now(UTC), finished_at=datetime.now(UTC), summary={"dry_run": True, "total_trials": len(schedule), "schedule": [ - {"task": s["task"].id, "trial": s["trial_number"]} + { + "task": s["task"].id, + "harness": s["harness_spec"].name, + "skills": s["with_skills"], + "trial": s["trial_number"], + } for s in schedule ]}, ) @@ -252,6 +310,8 @@ def _signal_handler(signum: int, frame: Any) -> None: run_trial, s["task"], s["trial_number"], + harness_spec=s["harness_spec"], + with_skills=s["with_skills"], evals_dir=evals_dir, config=config, run_id=run_id, @@ -272,6 +332,8 @@ def _signal_handler(signum: int, frame: Any) -> None: trial = TrialResult( trial_id=f"{run_id}-error", 
task_id=s["task"].id, + harness=s["harness_spec"].name, + with_skills=s["with_skills"], trial_number=s["trial_number"], error=str(exc), )
diff --git a/src/lightcone/eval/harnesses/__init__.py b/src/lightcone/eval/harnesses/__init__.py
new file mode 100644
index 00000000..dc2623eb
--- /dev/null
+++ b/src/lightcone/eval/harnesses/__init__.py
@@ -0,0 +1,59 @@
+"""Harness registry — maps a harness id to its :class:`Harness` implementation.
+
+To add a harness: create ``harnesses/<name>.py`` with a ``Harness`` subclass,
+import it here, and add it to ``_REGISTRY``. The eval run config selects
+harnesses by id (``harnesses: [claude, codex, pi]``).
+"""
+
+from __future__ import annotations
+
+from lightcone.eval.harnesses.base import (
+    AgentResult,
+    CommandResult,
+    Harness,
+    SandboxLike,
+)
+from lightcone.eval.harnesses.claude import ClaudeHarness
+from lightcone.eval.harnesses.codex import CodexHarness
+from lightcone.eval.harnesses.pi import PiHarness
+
+_REGISTRY: dict[str, type[Harness]] = {
+    ClaudeHarness.name: ClaudeHarness,
+    CodexHarness.name: CodexHarness,
+    PiHarness.name: PiHarness,
+}
+
+
+def get_harness(name: str) -> Harness:
+    """Instantiate the harness registered under ``name``.
+
+    Raises:
+        ValueError: if ``name`` is not a registered harness id.
+    """
+    try:
+        harness_cls = _REGISTRY[name]
+    except KeyError:
+        raise ValueError(
+            f"Unknown harness {name!r}; known harnesses: {sorted(_REGISTRY)}"
+        ) from None
+    # Instantiate outside the ``try`` so a KeyError raised by a harness
+    # constructor is not misreported as an unknown harness id.
+    return harness_cls()
+
+
+def available_harnesses() -> list[str]:
+    """Sorted list of registered harness ids."""
+    return sorted(_REGISTRY)
+
+
+__all__ = [
+    "AgentResult",
+    "ClaudeHarness",
+    "CodexHarness",
+    "CommandResult",
+    "Harness",
+    "PiHarness",
+    "SandboxLike",
+    "available_harnesses",
+    "get_harness",
+]
diff --git a/src/lightcone/eval/harnesses/base.py b/src/lightcone/eval/harnesses/base.py new file mode 100644 index 00000000..5eeda421 --- /dev/null +++ b/src/lightcone/eval/harnesses/base.py @@ -0,0 +1,131 @@ +"""Harness abstraction: one agent CLI driven headlessly inside a Sandbox. + +A ``Harness`` is the single seam where the eval becomes agent-specific. 
Before +this layer, ``EvalSandbox.exec_claude`` hardwired Claude Code into the trial +loop. A harness declares four things and nothing more: + +1. how to **install** its agent CLI into the eval image, +2. which host **credentials** to forward for auth, +3. optional per-trial project **prepare** (e.g. strip the Lightcone scaffold for + the without-skills arm of the A/B), and +4. how to **invoke** the agent headlessly, returning a parsed ``AgentResult``. + +Everything downstream stays harness-agnostic: graders read the materialized +filesystem (``lc status --json``, ``astra validate``), never the agent. The +concrete per-harness commands live in the ``harness-invocation-matrix`` fiber. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Protocol, runtime_checkable + + +@dataclass +class AgentResult: + """Parsed result of one headless agent invocation, normalized across + harnesses. + + ``cost_usd`` is best-effort: harnesses that do not price a run (e.g. codex) + report ``0.0``. ``raw_jsonl`` keeps the agent's full stdout for the trial + transcript sidecar. + """ + + cost_usd: float = 0.0 + num_turns: int = 0 + duration_ms: int = 0 + result_text: str = "" + is_error: bool = False + raw_jsonl: str = "" + + +@runtime_checkable +class CommandResult(Protocol): + """Anything with an exit code and captured output — e.g. the sandbox + backends' ``ExecuteResult``.""" + + exit_code: int + output: str + + +@runtime_checkable +class SandboxLike(Protocol): + """The execution substrate a ``Harness`` drives. + + Both the Daytona ``EvalSandbox`` and ``LocalDockerSandbox`` satisfy this. A + harness uses only these primitives, so it never depends on a specific backend. + """ + + WORK_DIR: str + + def exec(self, cmd: str, timeout: int = 300, cwd: str | None = None) -> CommandResult: ... + + def exec_async_poll( + self, cmd: str, timeout: int = 600, poll_interval: int = 10 + ) -> CommandResult: ... 
+ + def upload_file(self, remote_path: str, content: bytes) -> None: ... + + +class Harness(ABC): + """One agent CLI, driven headlessly inside a sandbox. + + Subclasses set ``name`` (the registry id) and implement install / + credentials / invoke. ``prepare`` has a sensible default — strip the + Lightcone scaffold for the without-skills arm — that file-scaffold harnesses + (Claude) inherit and flag-based harnesses (pi) may override. + """ + + #: registry id, e.g. "claude" / "codex" / "pi" + name: str = "" + + @abstractmethod + def install_commands(self) -> list[str]: + """Shell commands (run as root at image-build time) that install the + agent CLI onto ``PATH``. Consumed by every sandbox backend's image + build so the binary is present before a trial starts.""" + + @abstractmethod + def credential_env_keys(self) -> list[str]: + """Host env var names to forward into the sandbox for auth. Missing keys + are skipped — a harness with no credentials is reported *skipped*, not + failed.""" + + @abstractmethod + def invoke( + self, + sandbox: SandboxLike, + *, + prompt_path: str, + work_dir: str, + max_turns: int, + timeout: int, + with_skills: bool, + model: str | None = None, + ) -> AgentResult: + """Run the agent headlessly against the project in ``work_dir``, reading + the loop prompt from ``prompt_path`` inside the sandbox. + + ``with_skills`` lets harnesses with native skill flags (pi's + ``--skill`` / ``--no-skills``) toggle skill loading at invoke time; + file-scaffold harnesses rely on ``prepare`` having stripped the scaffold + instead, and can ignore it. + """ + + def prepare( + self, sandbox: SandboxLike, *, work_dir: str, with_skills: bool + ) -> None: + """Per-trial project prep before invocation. + + Default: for the without-skills arm, strip the Lightcone scaffold so the + agent runs bare (``.claude/`` skills+hooks, ``CLAUDE.md``, and any + sibling agent-context files). 
The ``lc`` engine is intentionally left in + place — the A/B isolates the *guidance layer*, not the execution + substrate. Harnesses that gate skills via invoke flags may override. + """ + if not with_skills: + sandbox.exec( + f"cd {work_dir} && rm -rf .claude CLAUDE.md AGENTS.md GEMINI.md", + timeout=30, + ) diff --git a/src/lightcone/eval/harnesses/claude.py b/src/lightcone/eval/harnesses/claude.py new file mode 100644 index 00000000..ab4ade52 --- /dev/null +++ b/src/lightcone/eval/harnesses/claude.py @@ -0,0 +1,93 @@ +"""Claude Code harness — ``claude -p`` headless (stream-json).""" + +from __future__ import annotations + +import json +import shlex +import time + +from lightcone.eval.harnesses.base import AgentResult, Harness, SandboxLike + + +class ClaudeHarness(Harness): + """Drives Claude Code. This is the original (and reference) seam: the trial + loop used to call this logic directly via ``EvalSandbox.exec_claude``.""" + + name = "claude" + + def install_commands(self) -> list[str]: + return [ + "curl -fsSL https://claude.ai/install.sh | bash" + " && cp /root/.local/bin/claude /usr/local/bin/claude", + ] + + def credential_env_keys(self) -> list[str]: + return ["CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"] + + def invoke( + self, + sandbox: SandboxLike, + *, + prompt_path: str, + work_dir: str, + max_turns: int, + timeout: int, + with_skills: bool, + model: str | None = None, + ) -> AgentResult: + # Claude Code consumes skills from the scaffolded `.claude/` dir, so the + # with/without-skills split is handled by `prepare` (file strip), not a + # flag here. `with_skills` is accepted for interface uniformity. 
+ model_flag = f"--model {shlex.quote(model)}" if model else "" + cmd = ( + f"cd {work_dir} && " + f'claude -p "$(cat {shlex.quote(prompt_path)})" ' + f"--output-format stream-json --verbose " + f"--dangerously-skip-permissions " + f"--max-turns {max_turns} " + f"{model_flag}" + ).strip() + + start = time.monotonic() + result = sandbox.exec_async_poll(cmd, timeout=timeout) + duration_ms = int((time.monotonic() - start) * 1000) + return parse_claude_output(result.output, result.exit_code, duration_ms) + + +def parse_claude_output( + raw_output: str, exit_code: int, duration_ms: int +) -> AgentResult: + """Parse JSONL from ``claude -p --output-format stream-json``. + + One JSON object per line; the final ``{"type": "result", ...}`` line carries + the aggregate metrics. + """ + result = AgentResult(duration_ms=duration_ms, raw_jsonl=raw_output) + + if exit_code != 0: + result.is_error = True + result.result_text = raw_output + return result + + for raw_line in reversed(raw_output.strip().splitlines()): + stripped = raw_line.strip() + if not stripped or not stripped.startswith("{"): + continue + try: + data = json.loads(stripped) + if data.get("type") == "result": + result.cost_usd = float( + data.get("cost_usd", data.get("total_cost_usd", 0.0)) + ) + result.num_turns = int(data.get("num_turns", 0)) + result.duration_ms = int(data.get("duration_ms", duration_ms)) + result.result_text = str(data.get("result", "")) + result.is_error = bool(data.get("is_error", False)) + return result + except (json.JSONDecodeError, ValueError): + continue + + # No result line found + result.result_text = raw_output + result.is_error = True + return result diff --git a/src/lightcone/eval/harnesses/codex.py b/src/lightcone/eval/harnesses/codex.py new file mode 100644 index 00000000..189b8287 --- /dev/null +++ b/src/lightcone/eval/harnesses/codex.py @@ -0,0 +1,104 @@ +"""OpenAI Codex CLI harness — ``codex exec`` headless (JSONL).""" + +from __future__ import annotations + +import json 
+import shlex +import time + +from lightcone.eval.harnesses.base import AgentResult, Harness, SandboxLike + + +class CodexHarness(Harness): + """Drives the OpenAI Codex CLI. Mirrors :class:`ClaudeHarness`: a headless + ``codex exec`` run inside the sandbox, parsed into an ``AgentResult``.""" + + name = "codex" + + def install_commands(self) -> list[str]: + # The eval base image provides node/npm; the local binary is a brew cask + # but npm is the container path. + return ["npm install -g @openai/codex"] + + def credential_env_keys(self) -> list[str]: + return ["OPENAI_API_KEY", "CODEX_API_KEY"] + + # `prepare` is inherited from the base default (strip the scaffold for the + # without-skills arm). Note: Codex reads `AGENTS.md`, not `.claude/skills`, + # so "with-skills" guidance never actually reaches Codex — a known + # limitation tracked for LCR-85; we do not synthesize AGENTS.md here. + + def invoke( + self, + sandbox: SandboxLike, + *, + prompt_path: str, + work_dir: str, + max_turns: int, + timeout: int, + with_skills: bool, + model: str | None = None, + ) -> AgentResult: + # Codex has no native max-turns; the run is bounded by `timeout` only. + # `with_skills` is accepted for interface uniformity (see `prepare`). 
+ model_flag = f"-m {shlex.quote(model)}" if model else "" + cmd = ( + f"codex exec --cd {shlex.quote(work_dir)} " + f"--dangerously-bypass-approvals-and-sandbox " + f"--skip-git-repo-check --json " + f"{model_flag} " + f"-o /tmp/codex-last-message.txt " + f'"$(cat {shlex.quote(prompt_path)})"' + ).strip() + + start = time.monotonic() + result = sandbox.exec_async_poll(cmd, timeout=timeout) + duration_ms = int((time.monotonic() - start) * 1000) + + last = sandbox.exec("cat /tmp/codex-last-message.txt") + last_message = last.output if last.exit_code == 0 else "" + + return parse_codex_output( + result.output, result.exit_code, duration_ms, last_message + ) + + +def parse_codex_output( + raw_jsonl: str, exit_code: int, duration_ms: int, last_message: str +) -> AgentResult: + """Parse JSONL from ``codex exec --json``. + + The event shape is unconfirmed, so this stays tolerant: bad lines are + skipped and never raise. Codex does not price a run, so ``cost_usd`` is + always ``0.0``. ``result_text`` prefers the ``-o`` last-message file, + falling back to the last non-empty stdout line. ``num_turns`` is a + best-effort count of assistant/turn events. + """ + result = AgentResult( + cost_usd=0.0, + duration_ms=duration_ms, + is_error=exit_code != 0, + raw_jsonl=raw_jsonl, + ) + + num_turns = 0 + last_nonempty_line = "" + for raw_line in raw_jsonl.splitlines(): + stripped = raw_line.strip() + if not stripped: + continue + last_nonempty_line = stripped + if not stripped.startswith("{"): + continue + try: + data = json.loads(stripped) + except (json.JSONDecodeError, ValueError): + continue + # Best-effort turn detection: tolerate several plausible event shapes. 
+ event_type = str(data.get("type", "") or data.get("event", "")) + if "turn" in event_type or "assistant" in event_type: + num_turns += 1 + + result.num_turns = num_turns + result.result_text = last_message or last_nonempty_line + return result diff --git a/src/lightcone/eval/harnesses/pi.py b/src/lightcone/eval/harnesses/pi.py new file mode 100644 index 00000000..43be5abb --- /dev/null +++ b/src/lightcone/eval/harnesses/pi.py @@ -0,0 +1,133 @@ +"""pi harness — ``pi -p`` headless (``@earendil-works/pi-coding-agent``). + +Unlike Claude Code, pi gates skill loading via *invoke flags* (``--skill`` / +``--no-skills``), not via the on-disk scaffold. So this harness keeps the +Lightcone scaffold in place (no-op ``prepare``) and lets the with/without-skills +flags decide what the agent sees. +""" + +from __future__ import annotations + +import json +import shlex +import time + +from lightcone.eval.harnesses.base import AgentResult, Harness, SandboxLike + + +class PiHarness(Harness): + """Drives the pi coding agent (``@earendil-works/pi-coding-agent``). + + pi uses the process CWD as the project dir (no ``--cd`` flag), so every + command is prefixed with ``cd <work_dir> &&``. Skill loading is toggled at + invoke time, not by file-stripping. + """ + + name = "pi" + + def install_commands(self) -> list[str]: + return ["npm install -g @earendil-works/pi-coding-agent"] + + def credential_env_keys(self) -> list[str]: + return ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GEMINI_API_KEY"] + + def prepare( + self, sandbox: SandboxLike, *, work_dir: str, with_skills: bool + ) -> None: + """No-op: pi gates skills via invoke flags (``--skill`` / ``--no-skills``), + not by stripping the on-disk scaffold. 
We keep the scaffold in place and + let the flags in ``invoke`` decide what the agent loads.""" + + def invoke( + self, + sandbox: SandboxLike, + *, + prompt_path: str, + work_dir: str, + max_turns: int, + timeout: int, + with_skills: bool, + model: str | None = None, + ) -> AgentResult: + # pi uses the process CWD as the project dir (no `--cd`), so prefix the + # command with `cd <work_dir>`. `--no-session` keeps the run ephemeral. + if with_skills: + # Load the lc-init scaffold's skills dir (relative to work_dir). + skill_flags = "--skill .claude/skills" + else: + # Bare run: disable skill discovery and context files (CLAUDE.md etc). + skill_flags = "--no-skills --no-context-files" + + model_flag = "" + if model: + model_flag = f"--model {shlex.quote(model)}" + # Heuristic: Claude-family model ids need the anthropic provider, since + # pi's default provider is google. Other ids fall through to whatever + # provider pi is configured with. + if any(tag in model.lower() for tag in ("claude", "sonnet", "opus", "haiku")): + model_flag = f"--provider anthropic {model_flag}" + + cmd = ( + f"cd {work_dir} && " + f'pi -p "$(cat {shlex.quote(prompt_path)})" ' + f"--mode json --no-session " + f"{skill_flags} " + f"{model_flag}" + ).strip() + + start = time.monotonic() + result = sandbox.exec_async_poll(cmd, timeout=timeout) + duration_ms = int((time.monotonic() - start) * 1000) + return parse_pi_output(result.output, result.exit_code, duration_ms) + + +def parse_pi_output(raw: str, exit_code: int, duration_ms: int) -> AgentResult: + """Parse output from ``pi -p --mode json``. + + The ``--mode json`` shape is UNCONFIRMED, so this is deliberately tolerant: + try ``json.loads`` on the whole stdout, then on the last non-empty line; pull + cost / turn / final-text fields under their plausible aliases. Never raises on + malformed JSON — falls back to the last non-empty line as ``result_text``. 
+ """ + result = AgentResult(duration_ms=duration_ms, raw_jsonl=raw, is_error=exit_code != 0) + + lines = [ln.strip() for ln in raw.strip().splitlines() if ln.strip()] + last_line = lines[-1] if lines else "" + + data: dict | None = None + for candidate in (raw.strip(), last_line): + if not candidate: + continue + try: + parsed = json.loads(candidate) + except (json.JSONDecodeError, ValueError): + continue + if isinstance(parsed, dict): + data = parsed + break + + if data is None: + # No parseable JSON object — best-effort fall back to the last line. + result.result_text = last_line + return result + + cost = data.get("cost", data.get("cost_usd", data.get("total_cost_usd", 0.0))) + try: + result.cost_usd = float(cost) + except (TypeError, ValueError): + result.cost_usd = 0.0 + + turns = data.get("num_turns", data.get("turns", 0)) + try: + result.num_turns = int(turns) + except (TypeError, ValueError): + result.num_turns = 0 + + for key in ("result", "text", "content", "message"): + if key in data and data[key]: + result.result_text = str(data[key]) + break + else: + result.result_text = last_line + + return result diff --git a/src/lightcone/eval/models.py b/src/lightcone/eval/models.py index d7f9ad7f..6baa8d14 100644 --- a/src/lightcone/eval/models.py +++ b/src/lightcone/eval/models.py @@ -6,7 +6,18 @@ from enum import StrEnum from typing import Any -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator + + +class HarnessSpec(BaseModel): + """One agent harness in the run matrix, with an optional model override. + + Accepts a bare string in YAML (``- claude``) or a mapping + (``{name: codex, model: gpt-5.3-codex-spark}``). 
+ """ + + name: str + model: str | None = None class GraderType(StrEnum): @@ -66,10 +77,12 @@ class GraderResult(BaseModel): class TrialResult(BaseModel): - """Result of a single trial (one task x repetition).""" + """Result of a single trial (one task x harness x skill-variant x rep).""" trial_id: str task_id: str + harness: str = "claude" + with_skills: bool = True trial_number: int = 0 started_at: datetime | None = None finished_at: datetime | None = None @@ -84,14 +97,37 @@ class TrialResult(BaseModel): class EvalRunConfig(BaseModel): - """Configuration for an eval run (loaded from run config YAML).""" + """Configuration for an eval run (loaded from run config YAML). + + The run is a matrix: ``tasks`` x ``harnesses`` x ``skill_variants`` x + ``num_trials``. ``skill_variants`` is the with/without-Lightcone A/B — + ``[true]`` for a single arm, ``[true, false]`` to measure the lift. + """ id: str = "" tasks: list[str] = Field(default_factory=list) + backend: str = "daytona" # "daytona" | "local_docker" + harnesses: list[HarnessSpec] = Field( + default_factory=lambda: [HarnessSpec(name="claude")] + ) + skill_variants: list[bool] = Field(default_factory=lambda: [True]) num_trials: int = 3 max_concurrency: int = 4 sandbox_image: str | None = None output_dir: str = "eval-results" + # Optional run-level overrides of the task's budget — handy for cheap + # plumbing smokes (cap turns so a trial exits fast without a full build). 
+ max_turns: int | None = None + trial_timeout: int | None = None + + @field_validator("harnesses", mode="before") + @classmethod + def _coerce_harnesses(cls, v: object) -> object: + """Allow ``harnesses: [claude, {name: codex, model: ...}]`` — bare + strings become ``HarnessSpec(name=...)``.""" + if isinstance(v, list): + return [{"name": h} if isinstance(h, str) else h for h in v] + return v class VersionInfo(BaseModel): diff --git a/src/lightcone/eval/report.py b/src/lightcone/eval/report.py index 42a677d7..d2bb400d 100644 --- a/src/lightcone/eval/report.py +++ b/src/lightcone/eval/report.py @@ -189,6 +189,99 @@ def print_comparison_between( console.print(table) +def compute_matrix(eval_run: EvalRun) -> dict[tuple[str, str, bool], dict[str, Any]]: + """Group trials by ``(task, harness, with_skills)`` — the multi-harness A/B + matrix. Keys are tuples; values mirror ``compute_summary``'s per-group stats. + """ + groups: dict[tuple[str, str, bool], list[TrialResult]] = defaultdict(list) + for t in eval_run.trials: + groups[(t.task_id, t.harness, t.with_skills)].append(t) + + out: dict[tuple[str, str, bool], dict[str, Any]] = {} + for key, trials in groups.items(): + ok = [t for t in trials if t.error is None] + n = len(ok) + scores = [t.composite_score for t in ok] + out[key] = { + "task": key[0], + "harness": key[1], + "with_skills": key[2], + "num_trials": len(trials), + "num_errors": len(trials) - n, + "mean_score": round(sum(scores) / n, 4) if n else 0.0, + "pass_at_k": (sum(1 for t in ok if t.build_complete) / len(trials)) + if trials + else 0.0, + "mean_cost_usd": round(sum(t.total_cost_usd for t in ok) / n, 4) if n else 0.0, + "mean_duration_seconds": round( + sum(t.total_duration_seconds for t in ok) / n, 1 + ) + if n + else 0.0, + } + return out + + +def _matrix_cell(g: dict[str, Any] | None) -> str: + """Render one harness×variant cell: score, pass@k, and cost/errors if any.""" + if g is None: + return "—" + cell = f"{g['mean_score']:.2f} pass@k 
{g['pass_at_k']:.0%}" + extra = [] + if g["num_errors"]: + extra.append(f"[red]{g['num_errors']} err[/red]") + if g["mean_cost_usd"]: + extra.append(f"${g['mean_cost_usd']:.2f}") + return cell + (("\n" + " ".join(extra)) if extra else "") + + +def print_matrix_table(eval_run: EvalRun, console: Console | None = None) -> None: + """Print the task × harness × {with,without}-Lightcone matrix. + + The Δ column is the with−without **lift** — the A/B headline: how much the + Lightcone layer moved the score on each harness. Shown only when both arms + ran for a (task, harness) pair. + """ + if console is None: + console = Console() + + matrix = compute_matrix(eval_run) + if not matrix: + console.print("[yellow]No results to display.[/yellow]") + return + + tasks = sorted({k[0] for k in matrix}) + harnesses = sorted({k[1] for k in matrix}) + has_both = any((t, h, True) in matrix and (t, h, False) in matrix + for t in tasks for h in harnesses) + + table = Table(title="Eval Matrix: score by harness × Lightcone layer", show_lines=True) + table.add_column("Task", style="bold") + table.add_column("Harness", style="bold") + table.add_column("with skills", justify="center") + table.add_column("without skills", justify="center") + if has_both: + table.add_column("Δ lift", justify="center") + + for task in tasks: + for h in harnesses: + w = matrix.get((task, h, True)) + wo = matrix.get((task, h, False)) + if w is None and wo is None: + continue + row = [task, h, _matrix_cell(w), _matrix_cell(wo)] + if has_both: + if w and wo: + delta = w["mean_score"] - wo["mean_score"] + color = "green" if delta > 0 else ("red" if delta < 0 else "white") + row.append(f"[{color}]{delta:+.2f}[/{color}]") + else: + row.append("—") + table.add_row(*row) + + console.print(table) + + def save_results(eval_run: EvalRun, output_dir: str | Path) -> Path: """Save full EvalRun to JSON inside the run's sidecar directory.""" output_dir = Path(output_dir) diff --git a/src/lightcone/eval/sandbox.py 
b/src/lightcone/eval/sandbox.py index 6be41097..ac129edc 100644 --- a/src/lightcone/eval/sandbox.py +++ b/src/lightcone/eval/sandbox.py @@ -2,7 +2,6 @@ from __future__ import annotations -import json import logging import os import shlex @@ -11,10 +10,23 @@ from pathlib import Path from typing import Any +from lightcone.eval.harnesses.base import AgentResult +from lightcone.eval.harnesses.claude import parse_claude_output + logger = logging.getLogger(__name__) BUILD_COMPLETE_MARKER = "BUILD_COMPLETE" +# Idle-minutes backstop before Daytona auto-stops a sandbox. Teardown is the +# primary cleanup path (``teardown()`` deletes the sandbox), but any path where +# teardown does NOT run — an unhandled exception, the eval process being killed, +# a cancelled CI job — would otherwise leak a sandbox that runs forever and +# burns the compute budget (LCR-131). A non-zero auto-stop is the safety net: +# an active trial polls Daytona every ~10s so it never goes idle, while an +# orphaned sandbox stops after this many idle minutes. NOT zero — zero disables +# the net entirely, which is the bug. +AUTO_STOP_BACKSTOP_MINUTES = 30 + @dataclass class ExecuteResult: @@ -24,16 +36,11 @@ class ExecuteResult: output: str -@dataclass -class ClaudeResult: - """Parsed result from a claude -p invocation.""" - - cost_usd: float = 0.0 - num_turns: int = 0 - duration_ms: int = 0 - result_text: str = "" - is_error: bool = False - raw_jsonl: str = "" +# Back-compat alias: the harness layer now owns the normalized agent-result +# type. `_parse_claude_output` is re-exported from the Claude harness so callers +# (and tests) that imported it from here keep working. 
+ClaudeResult = AgentResult +_parse_claude_output = parse_claude_output @dataclass @@ -140,7 +147,9 @@ def create(self) -> None: image=image, labels=labels, env_vars=sandbox_env, - auto_stop_interval=0, # disable auto-stop; sandbox is deleted in teardown + # Backstop against leaked sandboxes (see AUTO_STOP_BACKSTOP_MINUTES); + # teardown() still deletes the sandbox on the normal path. + auto_stop_interval=AUTO_STOP_BACKSTOP_MINUTES, ) def _on_build_log(line: str) -> None: @@ -243,31 +252,26 @@ def exec_claude( max_turns: int = 25, timeout: int = 600, model: str | None = None, - ) -> ClaudeResult: - """Run claude -p with the loop prompt and parse JSON output. + ) -> AgentResult: + """Run Claude Code headless against the staged loop prompt. - Uses Daytona's session API with async execution + polling to avoid - HTTP connection timeouts on long-running Claude Code invocations. + Thin back-compat wrapper over :class:`ClaudeHarness`. The multi-harness + trial loop calls ``harness.invoke(sandbox, ...)`` directly; this stays so + existing callers and tests keep working. 
""" - assert self._sandbox is not None, "Call create() first" - - model_flag = f"--model {shlex.quote(model)}" if model else "" - cmd = ( - f"cd {self.WORK_DIR} && " - f"claude -p \"$(cat /tmp/loop-prompt.md)\" " - f"--output-format stream-json --verbose " - f"--dangerously-skip-permissions " - f"--max-turns {max_turns} " - f"{model_flag}" - ).strip() - - start = time.monotonic() - result = self._exec_async_poll(cmd, timeout=timeout) - duration_ms = int((time.monotonic() - start) * 1000) - - return _parse_claude_output(result.output, result.exit_code, duration_ms) + from lightcone.eval.harnesses.claude import ClaudeHarness + + return ClaudeHarness().invoke( + self, + prompt_path="/tmp/loop-prompt.md", + work_dir=self.WORK_DIR, + max_turns=max_turns, + timeout=timeout, + with_skills=True, + model=model, + ) - def _exec_async_poll( + def exec_async_poll( self, cmd: str, timeout: int = 600, poll_interval: int = 10 ) -> ExecuteResult: """Execute a command asynchronously via Daytona sessions and poll for completion. @@ -365,42 +369,3 @@ def _upload_directory(self, local_dir: Path, remote_dir: str) -> None: rel = local_path.relative_to(local_dir) remote_path = f"{remote_dir}/{rel}" self.upload_file(remote_path, local_path.read_bytes()) - - -def _parse_claude_output( - raw_output: str, exit_code: int, duration_ms: int -) -> ClaudeResult: - """Parse JSONL output from claude -p --output-format stream-json. - - The stream-json format emits one JSON object per line. The final line - with ``{"type": "result", ...}`` contains the aggregate metrics. 
- """ - result = ClaudeResult(duration_ms=duration_ms, raw_jsonl=raw_output) - - if exit_code != 0: - result.is_error = True - result.result_text = raw_output - return result - - for raw_line in reversed(raw_output.strip().splitlines()): - stripped = raw_line.strip() - if not stripped or not stripped.startswith("{"): - continue - try: - data = json.loads(stripped) - if data.get("type") == "result": - result.cost_usd = float( - data.get("cost_usd", data.get("total_cost_usd", 0.0)) - ) - result.num_turns = int(data.get("num_turns", 0)) - result.duration_ms = int(data.get("duration_ms", duration_ms)) - result.result_text = str(data.get("result", "")) - result.is_error = bool(data.get("is_error", False)) - return result - except (json.JSONDecodeError, ValueError): - continue - - # No result line found - result.result_text = raw_output - result.is_error = True - return result diff --git a/tests/test_eval_harness.py b/tests/test_eval_harness.py index 2e37b066..dc6f389b 100644 --- a/tests/test_eval_harness.py +++ b/tests/test_eval_harness.py @@ -17,6 +17,7 @@ ) from lightcone.eval.models import ( EvalRunConfig, + HarnessSpec, TaskSpec, ) from lightcone.eval.sandbox import ( @@ -99,12 +100,15 @@ def test_custom_prompt(self, evals_dir: Path): class TestRunTrial: @patch("lightcone.eval.harness.EvalSandbox") - def test_successful_trial(self, mock_sandbox_cls: MagicMock, evals_dir: Path): + @patch("lightcone.eval.harness.get_harness") + def test_successful_trial( + self, mock_get_harness: MagicMock, mock_sandbox_cls: MagicMock, evals_dir: Path + ): """Test a trial that completes successfully.""" sandbox_instance = mock_sandbox_cls.return_value sandbox_instance.WORK_DIR = "/home/user/project" - sandbox_instance.exec_claude.return_value = ClaudeResult( + mock_get_harness.return_value.invoke.return_value = ClaudeResult( cost_usd=0.05, num_turns=10, duration_ms=5000, @@ -122,7 +126,8 @@ def test_successful_trial(self, mock_sandbox_cls: MagicMock, evals_dir: Path): config = 
EvalRunConfig(id="test-run") trial = run_trial( - task, 0, evals_dir=evals_dir, config=config, run_id="r1", wheels=[], + task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True, + evals_dir=evals_dir, config=config, run_id="r1", wheels=[], ) assert trial.build_complete is True @@ -141,7 +146,8 @@ def test_trial_with_error(self, mock_sandbox_cls: MagicMock, evals_dir: Path): config = EvalRunConfig(id="test-run") trial = run_trial( - task, 0, evals_dir=evals_dir, config=config, run_id="r1", wheels=[], + task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True, + evals_dir=evals_dir, config=config, run_id="r1", wheels=[], ) assert trial.error is not None @@ -149,12 +155,15 @@ def test_trial_with_error(self, mock_sandbox_cls: MagicMock, evals_dir: Path): sandbox_instance.teardown.assert_called_once() @patch("lightcone.eval.harness.EvalSandbox") - def test_trial_incomplete(self, mock_sandbox_cls: MagicMock, evals_dir: Path): + @patch("lightcone.eval.harness.get_harness") + def test_trial_incomplete( + self, mock_get_harness: MagicMock, mock_sandbox_cls: MagicMock, evals_dir: Path + ): """Test a trial where the build does not complete.""" sandbox_instance = mock_sandbox_cls.return_value sandbox_instance.WORK_DIR = "/home/user/project" - sandbox_instance.exec_claude.return_value = ClaudeResult( + mock_get_harness.return_value.invoke.return_value = ClaudeResult( cost_usd=0.02, num_turns=5, duration_ms=3000, @@ -171,7 +180,8 @@ def test_trial_incomplete(self, mock_sandbox_cls: MagicMock, evals_dir: Path): config = EvalRunConfig(id="test-run") trial = run_trial( - task, 0, evals_dir=evals_dir, config=config, run_id="r1", wheels=[], + task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True, + evals_dir=evals_dir, config=config, run_id="r1", wheels=[], ) assert trial.build_complete is False @@ -181,13 +191,20 @@ def test_trial_incomplete(self, mock_sandbox_cls: MagicMock, evals_dir: Path): class TestSidecarFiles: 
@patch("lightcone.eval.harness.EvalSandbox") - def test_sidecar_written(self, mock_sandbox_cls: MagicMock, evals_dir: Path, tmp_path: Path): + @patch("lightcone.eval.harness.get_harness") + def test_sidecar_written( + self, + mock_get_harness: MagicMock, + mock_sandbox_cls: MagicMock, + evals_dir: Path, + tmp_path: Path, + ): """Test that JSONL sidecar files are written when sidecar_dir is provided.""" sandbox_instance = mock_sandbox_cls.return_value sandbox_instance.WORK_DIR = "/home/user/project" raw_jsonl = '{"type":"assistant","message":"hello"}\n{"type":"result","cost_usd":0.05}\n' - sandbox_instance.exec_claude.return_value = ClaudeResult( + mock_get_harness.return_value.invoke.return_value = ClaudeResult( cost_usd=0.05, num_turns=3, duration_ms=1000, result_text=BUILD_COMPLETE_MARKER, is_error=False, raw_jsonl=raw_jsonl, @@ -202,7 +219,8 @@ def test_sidecar_written(self, mock_sandbox_cls: MagicMock, evals_dir: Path, tmp sidecar_dir = tmp_path / "logs" trial = run_trial( - task, 0, evals_dir=evals_dir, config=config, + task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True, + evals_dir=evals_dir, config=config, run_id="r1", wheels=[], sidecar_dir=sidecar_dir, ) @@ -212,12 +230,15 @@ def test_sidecar_written(self, mock_sandbox_cls: MagicMock, evals_dir: Path, tmp assert full_path.read_text() == raw_jsonl @patch("lightcone.eval.harness.EvalSandbox") - def test_no_sidecar_without_dir(self, mock_sandbox_cls: MagicMock, evals_dir: Path): + @patch("lightcone.eval.harness.get_harness") + def test_no_sidecar_without_dir( + self, mock_get_harness: MagicMock, mock_sandbox_cls: MagicMock, evals_dir: Path + ): """transcript_path stays None when no sidecar_dir is given.""" sandbox_instance = mock_sandbox_cls.return_value sandbox_instance.WORK_DIR = "/home/user/project" - sandbox_instance.exec_claude.return_value = ClaudeResult( + mock_get_harness.return_value.invoke.return_value = ClaudeResult( cost_usd=0.01, num_turns=1, duration_ms=100, 
result_text=BUILD_COMPLETE_MARKER, is_error=False, raw_jsonl='{"type":"result"}\n', @@ -231,7 +252,8 @@ def test_no_sidecar_without_dir(self, mock_sandbox_cls: MagicMock, evals_dir: Pa config = EvalRunConfig(id="test-run") trial = run_trial( - task, 0, evals_dir=evals_dir, config=config, run_id="r1", wheels=[], + task, 0, harness_spec=HarnessSpec(name="claude"), with_skills=True, + evals_dir=evals_dir, config=config, run_id="r1", wheels=[], ) assert trial.iterations[0].transcript_path is None