diff --git a/experiments/skill_from_trajectory.py b/experiments/skill_from_trajectory.py new file mode 100644 index 00000000..77d2a952 --- /dev/null +++ b/experiments/skill_from_trajectory.py @@ -0,0 +1,515 @@ +"""Experiment: skill-from-trajectory vs guidelines vs no-recall. + +Per trial: + 1. Seed run on a fresh workspace (utterance 1) — produces guidelines via + `learn` and a saved trajectory. + 2. Synthesis run on the same workspace — invokes the new + `/evolve-lite:synthesize-skill` skill on the seed trajectory; produces + `.evolve/skills//` and `.claude/skills//`. + 3. Branch into three measure conditions, each a fresh copy of demo/workspace + plus the relevant memory: + - no_recall: nothing + - guidelines: seeded workspace's `.evolve/entities/` (no skills) + - skill: seeded workspace's `.claude/skills/` (no guidelines) + 4. For each condition, run each measure utterance once. Capture token usage, + duration, and the skill the model invoked (if any). + +Results: experiments/results/skill_from_trajectory_/ + - report.md three-way × per-utterance comparison table + - raw.json full per-run usage payloads + tool-call summaries + - synthesized_skills/ copy of each trial's synthesized skill dir + +Usage: + python3 experiments/skill_from_trajectory.py [--trials 5] +""" + +from __future__ import annotations + +import argparse +import json +import os +import shlex +import shutil +import statistics +import subprocess +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +# Reuse helpers from the existing token-savings experiment. 
sys.path.insert(0, str(Path(__file__).resolve().parent))
from token_savings import (  # type: ignore[import-not-found]  # noqa: E402
    FORWARDED_ENV_VARS,
    REPO_ROOT,
    SANDBOX_IMAGE,
    SESSION_TIMEOUT_SECONDS,
    _check_prerequisites,
    _extract_usage as _extract_usage_base,
    _newest_transcript,
    _per_turn_usage,
)


def _extract_usage(parsed: dict | None) -> dict:
    """Extend the base extractor with total_cost_usd, which we report per-trial.

    Args:
        parsed: Decoded JSON payload of a run, or None when no payload could
            be decoded.

    Returns:
        Whatever the base extractor returns for ``parsed``; when ``parsed`` is
        not None the dict additionally carries ``total_cost_usd`` (None if the
        payload has no such key).
    """
    out: dict = _extract_usage_base(parsed)
    if parsed is not None:
        out["total_cost_usd"] = parsed.get("total_cost_usd")
    return out


# All EXIF utterances, indexed by short key. The default seed is `gps` and the
# default measure set is all three; --seed-utterances can override the seed
# set (e.g. `gps,focal_length`) to test two-utterance seeding.
UTTERANCES: dict[str, str] = {
    "gps": "where was the photo @sample.jpg taken. use exif metadata",
    "focal_length": "what focal length was used to take the photo @sample.jpg. use exif metadata",
    "lens": "what lens model was used for @sample.jpg. use exif metadata",
}

# Default single-utterance seed (run A behavior).
DEFAULT_SEED_KEYS: list[str] = ["gps"]

# Default measure set (kept here for back-compat with the report-builder).
MEASURE_UTTERANCES: dict[str, str] = dict(UTTERANCES)

CONDITIONS = ("no_recall", "guidelines", "skill")


def _docker_path(p: Path) -> str:
    """Resolve a path for Docker bind-mounting on macOS.

    Docker on macOS doesn't follow the /tmp -> /private/tmp symlink for
    subdirectories: mounting /tmp/foo/bar lets the container see /tmp/foo
    but not its contents. Resolve to the real path before mounting.

    Args:
        p: Host path that is about to be bind-mounted.

    Returns:
        The fully resolved (symlink-free, absolute) path as a string.
    """
    return str(p.resolve())


def _parse_json_output(stdout: str) -> dict | None:
    """Extract the JSON object printed by the CLI from its captured stdout.

    The whole output is tried first. If that is not valid JSON, the lines are
    scanned from the end and the first one that is a complete JSON object
    wins, which tolerates log noise printed before the result.

    Args:
        stdout: Captured standard output of the sandboxed process.

    Returns:
        The decoded JSON object, or None when nothing usable was found.
    """
    if not stdout.strip():
        return None

    try:
        payload = json.loads(stdout)
    except json.JSONDecodeError:
        pass
    else:
        # Valid JSON that is not an object (e.g. a top-level array) would
        # break callers that use ``parsed.get(...)``; report it as "no
        # payload" instead of leaking a non-dict through a ``dict | None``
        # return type.
        return payload if isinstance(payload, dict) else None

    for raw_line in reversed(stdout.splitlines()):
        candidate = raw_line.strip()
        if not (candidate.startswith("{") and candidate.endswith("}")):
            continue
        try:
            # A valid JSON text delimited by braces is always an object.
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None


def _run_sandbox_prompt_json(workspace: Path, prompt: str) -> tuple[subprocess.CompletedProcess, dict | None]:
    """Run a prompt with --output-format json and return (proc, parsed_json).

    Local copy of the helper from token_savings.py, but resolves the
    workspace path before binding (see _docker_path).

    Args:
        workspace: Host directory bind-mounted at /workspace in the container.
        prompt: Prompt text; shell-quoted before being handed to ``bash -c``.

    Returns:
        A ``(proc, parsed)`` pair. ``parsed`` is the JSON object decoded from
        stdout, or None when the process exited non-zero, printed nothing, or
        printed no decodable JSON object.

    Raises:
        subprocess.TimeoutExpired: If the container runs longer than
            SESSION_TIMEOUT_SECONDS.
    """
    plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins"
    command = "claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --output-format json -p " + shlex.quote(prompt)

    cmd = ["docker", "run", "--rm"]
    # Forward by name only, and only variables that are set and non-empty on
    # the host, so values never appear on the docker command line.
    for var in FORWARDED_ENV_VARS:
        if os.environ.get(var):
            cmd += ["-e", var]
    cmd += [
        "-e",
        "EVOLVE_DEBUG=1",
        "-v",
        f"{_docker_path(workspace)}:/workspace",
        "-v",
        f"{_docker_path(plugins)}:/plugins",
        SANDBOX_IMAGE,
        "bash",
        "-c",
        command,
    ]

    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
    # Output of a failed run is not trusted as a result payload.
    parsed = _parse_json_output(proc.stdout) if proc.returncode == 0 else None
    return proc, parsed