From 6545389ff5c6225c635e1c1553e0d792be8f9c3e Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 25 Jun 2026 15:11:57 +0000 Subject: [PATCH 1/2] chore: remove the Python implementation The Rust crate now covers the whole pipeline (run, proxy, sandbox, drivers, export, inspect, ls), so retire the Python package and its tooling: - delete src/agentcap/, pyproject.toml, and the pytest suite (tests/*.py) - drop the Python CI workflows (linux-live-tests.yml, linux-non-live-tests.yml); the Rust Test / Test - Live / Build - Release workflows replace them - prune Python-only entries from .gitignore - fix the crate doc and a few comments that still described the Python half Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/linux-live-tests.yml | 60 - .github/workflows/linux-non-live-tests.yml | 43 - .github/workflows/live.yml | 4 +- .github/workflows/test.yml | 2 +- .gitignore | 11 +- pyproject.toml | 54 - src/agentcap/__init__.py | 3 - src/agentcap/__main__.py | 2684 ------------------- src/agentcap/captures.py | 190 -- src/agentcap/drivers/__init__.py | 174 -- src/agentcap/drivers/goose.py | 125 - src/agentcap/drivers/hermes.py | 204 -- src/agentcap/drivers/opencode.py | 236 -- src/agentcap/drivers/pi.py | 183 -- src/agentcap/export.py | 686 ----- src/agentcap/followups/__init__.py | 59 - src/agentcap/followups/continue_.py | 15 - src/agentcap/followups/synthesized.py | 127 - src/agentcap/followups/templates.py | 28 - src/agentcap/orchestrator.py | 220 -- src/agentcap/provider.py | 144 - src/agentcap/proxy.py | 415 --- src/agentcap/sandbox/__init__.py | 116 - src/agentcap/sandbox/podman.py | 189 -- src/agentcap/sandbox/podman_provisioning.py | 207 -- src/agentcap/scan.py | 247 -- src/lib.rs | 6 +- src/provider.rs | 8 +- tests/__init__.py | 0 tests/conftest.py | 514 ---- tests/fixtures/__init__.py | 0 tests/fixtures/sandbox_images.py | 134 - tests/live.rs | 9 +- tests/test_captures.py | 127 - tests/test_cli.py | 451 ---- tests/test_cli_live.py | 106 - tests/test_drivers.py | 239 -- tests/test_drivers_live.py | 110 - tests/test_export.py | 541 ---- tests/test_followups.py | 186 -- tests/test_inspect_helpers.py | 158 -- tests/test_orchestrator.py | 292 -- tests/test_podman_sandbox.py | 229 -- tests/test_provider.py | 108 - tests/test_proxy.py | 327 --- tests/test_proxy_http.py | 276 -- tests/test_proxy_meta.py | 250 -- tests/test_sandbox.py | 21 - tests/test_scan.py | 251 -- 49 files changed, 14 insertions(+), 10755 deletions(-) delete mode 100644 .github/workflows/linux-live-tests.yml delete mode 100644 .github/workflows/linux-non-live-tests.yml delete mode 100644 pyproject.toml delete mode 100644 src/agentcap/__init__.py delete mode 100644 src/agentcap/__main__.py delete mode 100644 src/agentcap/captures.py delete mode 100644 src/agentcap/drivers/__init__.py delete mode 100644 src/agentcap/drivers/goose.py delete mode 100644 src/agentcap/drivers/hermes.py delete mode 100644 src/agentcap/drivers/opencode.py delete mode 100644 src/agentcap/drivers/pi.py delete mode 100644 src/agentcap/export.py delete mode 100644 src/agentcap/followups/__init__.py delete mode 100644 src/agentcap/followups/continue_.py delete mode 100644 src/agentcap/followups/synthesized.py delete mode 100644 src/agentcap/followups/templates.py delete mode 100644 src/agentcap/orchestrator.py delete mode 100644 src/agentcap/provider.py delete mode 100644 src/agentcap/proxy.py delete mode 100644 src/agentcap/sandbox/__init__.py delete mode 100644 src/agentcap/sandbox/podman.py delete mode 100644 src/agentcap/sandbox/podman_provisioning.py delete mode 100644 src/agentcap/scan.py delete mode 100644 tests/__init__.py delete mode 100644 tests/conftest.py delete mode 100644 tests/fixtures/__init__.py delete mode 100644 tests/fixtures/sandbox_images.py delete mode 100644 tests/test_captures.py delete mode 100644 tests/test_cli.py delete mode 100644 tests/test_cli_live.py delete mode 100644 tests/test_drivers.py delete mode 100644 tests/test_drivers_live.py delete mode 100644 tests/test_export.py delete mode 100644 tests/test_followups.py delete mode 100644 tests/test_inspect_helpers.py delete mode 100644 tests/test_orchestrator.py delete mode 100644 tests/test_podman_sandbox.py delete mode 100644 tests/test_provider.py delete mode 100644 tests/test_proxy.py delete mode 100644 tests/test_proxy_http.py delete mode 100644 tests/test_proxy_meta.py delete mode 100644 tests/test_sandbox.py delete mode 100644 tests/test_scan.py diff --git a/.github/workflows/linux-live-tests.yml b/.github/workflows/linux-live-tests.yml deleted file mode 100644 index d2a9465..0000000 --- a/.github/workflows/linux-live-tests.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: linux-live-tests - -on: - push: - branches: [main] - pull_request: - branches: [main] - -jobs: - tests: - # Live end-to-end tests: build per-agent sandbox images, spawn - # the llama.cpp server as a sibling podman container, and run - # real agent CLIs against it. Failures here block merges — these - # are the only tests that exercise the full sandbox + proxy + - # agent stack end-to-end. - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6.0.3 - - uses: actions/setup-python@v6 - with: - python-version: "3.12" - cache: pip - - name: Install system deps (podman) - # Required by ``agentcap.sandbox.podman``. Ships in Ubuntu's - # default apt sources; rootless mode uses the runner user. - run: | - sudo apt-get update - sudo apt-get install -y podman - - name: Install Python deps - run: | - python -m pip install --upgrade pip - pip install -e '.[dev]' - - name: Cache GGUF weights - # The default test GGUF (Qwen3-1.7B Q8_0, ~1.8 GB) lands in - # the HF hub cache. Caching it keeps live runs cheap after - # the first download. - uses: actions/cache@v5 - with: - path: ~/.cache/huggingface - key: hf-hub-gguf-${{ hashFiles('tests/conftest.py') }} - restore-keys: | - hf-hub-gguf- - - name: Cache sandbox images - # Per-agent sandbox images (hermes is the heaviest at ~900 MB) - # are built on first use and stamped with a hash label. Caching - # the rootless containers/storage between runs lets - # ``ensure_image`` short-circuit on hash match. - uses: actions/cache@v5 - with: - path: ~/.local/share/containers - key: sandbox-images-${{ hashFiles('containers/**') }} - restore-keys: | - sandbox-images- - - name: Pre-build sandbox images - # Idempotent build of every per-agent image. On a cache hit - # this is a no-op (hash match in ``ensure_image``); on a miss - # it builds and the next run reuses the cache. - run: python tests/fixtures/sandbox_images.py - - name: Run live tests - run: pytest -m live tests/ -v diff --git a/.github/workflows/linux-non-live-tests.yml b/.github/workflows/linux-non-live-tests.yml deleted file mode 100644 index dfda9d8..0000000 --- a/.github/workflows/linux-non-live-tests.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: linux-non-live-tests - -on: - push: - branches: [main] - pull_request: - branches: [main] - -jobs: - tests: - # Lint + unit tests on the default Ubuntu runner. The live tests - # live in ``linux-live-tests.yml`` and run on a GPU runner so the - # local ``llama serve`` doesn't stall on CPU inference. - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6.0.3 - - uses: actions/setup-python@v6 - with: - python-version: "3.12" - cache: pip - - name: Install Python deps - run: | - python -m pip install --upgrade pip - pip install -e '.[dev]' - - name: Install trufflehog - # Version pinned: detector-output changes (false-positive - # bucket) would silently flip test_scan / test_cli scan-on - # assertions. - env: - TRUFFLEHOG_VERSION: 3.95.3 - TRUFFLEHOG_SHA256: 5d836eae522540a32ca0f1a1e00efd4c3153a52462466a4b4008fac1e6c1a548 - run: | - set -euo pipefail - cd /tmp - curl -sSfL -O "https://github.com/trufflesecurity/trufflehog/releases/download/v${TRUFFLEHOG_VERSION}/trufflehog_${TRUFFLEHOG_VERSION}_linux_amd64.tar.gz" - echo "${TRUFFLEHOG_SHA256} trufflehog_${TRUFFLEHOG_VERSION}_linux_amd64.tar.gz" | sha256sum -c - - mkdir -p "$HOME/.local/bin" - tar -xzf "trufflehog_${TRUFFLEHOG_VERSION}_linux_amd64.tar.gz" -C "$HOME/.local/bin" trufflehog - echo "$HOME/.local/bin" >> "$GITHUB_PATH" - - name: Lint (ruff) - run: ruff check . - - name: Tests (unit only, no live) - run: pytest -m "not live" tests/ diff --git a/.github/workflows/live.yml b/.github/workflows/live.yml index 7023677..f8efa3b 100644 --- a/.github/workflows/live.yml +++ b/.github/workflows/live.yml @@ -3,7 +3,7 @@ name: Test - Live # Full agent×model end-to-end: spin a real llama.cpp server, build the per-agent # sandbox images on demand, and drive `agentcap run` against the server for each # agent (pi/hermes/goose). Heavy (GGUF download + image builds + CPU inference) — -# this is agentcap's "live" tier, the Rust port of `linux-live-tests.yml`. +# this is agentcap's "live" tier (real agent × model, end-to-end). on: push: @@ -29,7 +29,7 @@ permissions: env: CARGO_TERM_COLOR: always RUST_BACKTRACE: "1" - # Pinned to match tests/conftest.py (the proven Python live setup). + # Pinned to the proven live setup (Qwen3-1.7B-Q8 on a CPU llama.cpp server). GGUF_REPO: Qwen/Qwen3-1.7B-GGUF GGUF_FILE: Qwen3-1.7B-Q8_0.gguf LLAMA_IMAGE: ghcr.io/ggml-org/llama.cpp:server-b9487 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6549d2d..767cd0b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Test - Unit & Integration # Hermetic tests only: unit tests + the loopback proxy integration test. No # network, podman, or model server. The full agent×model end-to-end ("live") -# tests are a separate, resource-heavy category — see `linux-live-tests.yml`. +# tests are a separate, resource-heavy category — see `live.yml`. on: pull_request: diff --git a/.gitignore b/.gitignore index e77b1c4..51f99e5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,19 +1,10 @@ -__pycache__/ -*.pyc -.venv/ -.venv*/ -build/ -dist/ -*.egg-info/ runs/ .agentcap/ examples/*/sandbox/ -.pytest_cache/ -.ruff_cache/ .vscode/ .idea/ .DS_Store # Rust /target/ - +dist/ diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 6e52a0e..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,54 +0,0 @@ -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[project] -name = "agentcap" -version = "0.0.1" -description = "Run coding agents at scale across (agent × model × corpus), capture every chat-completion byte, publish as a Hugging Face dataset." -readme = "README.md" -requires-python = ">=3.10" -license = "Apache-2.0" -license-files = ["LICENSE"] -keywords = ["llm", "agent", "capture", "dataset", "huggingface", "kv-cache"] -authors = [ - { name = "David Corvoysier", email = "david@huggingface.co" }, -] - -dependencies = [ - "httpx>=0.27", - # Floor pinned to address CVE-2026-48710 (GHSA-86qp-5c8j-p5mr): - # Host header poisons request.url.path; path-based middleware can be bypassed. - "starlette>=1.0.1", - "uvicorn>=0.30", - "huggingface_hub>=1.13", # HfApi.upload_file with repo_type="dataset" - "pyyaml>=6", # used by HermesDriver to overlay context_length / base_url into config.yaml - "click>=8.1", - "pyarrow>=15", # streaming ParquetWriter in export_local - "tqdm>=4.60", # per-row progress in export_local -] - -[project.optional-dependencies] -dev = [ - "pytest>=8", - "pytest-asyncio>=0.24", - "ruff>=0.6", -] - -[project.urls] -Homepage = "https://github.com/huggingface/agentcap" -Repository = "https://github.com/huggingface/agentcap" -Issues = "https://github.com/huggingface/agentcap/issues" - -[project.scripts] -agentcap = "agentcap.__main__:main" - -[tool.hatch.build.targets.wheel] -packages = ["src/agentcap"] - -[tool.pytest.ini_options] -testpaths = ["tests"] -markers = [ - "integration: tests that spin up real uvicorn servers and talk over TCP loopback (slower)", - "live: integration tests that invoke a real agent CLI against a real model server (skipped unless agent binaries + AGENTCAP_TEST_LLM_URL or AGENTCAP_TEST_GGUF are configured)", -] diff --git a/src/agentcap/__init__.py b/src/agentcap/__init__.py deleted file mode 100644 index 6b66323..0000000 --- a/src/agentcap/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""agentcap: capture LLM-agent chat-completion bytes, export as HF datasets.""" - -__version__ = "0.0.1" diff --git a/src/agentcap/__main__.py b/src/agentcap/__main__.py deleted file mode 100644 index c04c472..0000000 --- a/src/agentcap/__main__.py +++ /dev/null @@ -1,2684 +0,0 @@ -"""CLI entrypoint. See ``agentcap --help`` for the subcommand list.""" - -from __future__ import annotations - -import functools -import os -import sys -from collections.abc import Sequence -from pathlib import Path -from urllib.parse import urlparse - -import click - -from . import __version__ -from .drivers import known_drivers as _known_drivers - - -@click.group(context_settings={"help_option_names": ["-h", "--help"]}) -@click.version_option(__version__, prog_name="agentcap") -def cli() -> None: - """agentcap: capture LLM-agent chat-completion bytes.""" - - -def _is_hf_router_upstream(upstream: str) -> bool: - host = (urlparse(upstream).hostname or "").lower() - return host == "router.huggingface.co" - - -def _read_hf_token_cache() -> str | None: - token_path = Path.home() / ".cache" / "huggingface" / "token" - try: - token = token_path.read_text().strip() - except OSError: - return None - return token or None - - -_WORKSPACE_DIR = ".agentcap" - - -def _workspace_source() -> tuple[Path, str]: - """Return the workspace root (without ``.agentcap`` suffix) and a - short label of where the value came from. Used by error messages - so the user can see, verbatim, what AGENTCAP_WORKSPACE resolved - to — catches shell typos like ``WORKSPACE==/path`` that leave a - leading ``=`` in the env var value.""" - env = os.environ.get("AGENTCAP_WORKSPACE") - if env is not None: - return Path(env), f"AGENTCAP_WORKSPACE={env!r}" - return Path(os.getcwd()), "cwd (AGENTCAP_WORKSPACE unset)" - - -def _workspace_root() -> Path: - base, _ = _workspace_source() - return base / _WORKSPACE_DIR - - -def _no_workspace_msg(workspace: Path) -> str: - _, src = _workspace_source() - return ( - f"no workspace at {str(workspace)!r} (from {src}). " - f"Run `agentcap run` first, or set AGENTCAP_WORKSPACE to a " - f"directory that contains a ``.agentcap/`` subdir." - ) - - -def _default_workdir(agent: str, provider_slug: str) -> Path: - import time - utc = time.strftime("%Y%m%d-%H%M%S", time.gmtime()) - slug = provider_slug.replace("/", "-") - return _workspace_root() / f"{agent}-{slug}-{utc}" - - -def _resolve_api_key( - *, upstream: str, explicit_api_key: str | None -) -> tuple[str | None, str | None]: - if explicit_api_key: - return explicit_api_key, "--api-key / AGENTCAP_API_KEY" - if not _is_hf_router_upstream(upstream): - return None, None - - hf_env = (os.environ.get("HF_TOKEN") or "").strip() - if hf_env: - return hf_env, "HF_TOKEN" - - cached = _read_hf_token_cache() - if cached: - return cached, "~/.cache/huggingface/token" - - return None, None - - -def _complete_run_ids(ctx, param, incomplete): - """Shell completion for workspace run-ids — always against cwd's - ``.agentcap/`` (inspect doesn't consult ``$AGENTCAP_WORKSPACE``).""" - root = Path.cwd() / _WORKSPACE_DIR - if not root.is_dir(): - return [] - return [ - d.name for d in root.iterdir() - if d.is_dir() and (d / "run.json").is_file() - and d.name.startswith(incomplete) - ] - - -def _complete_request_ids(ctx, param, incomplete): - """Shell completion for captured request-ids across cwd's workspace.""" - root = Path.cwd() / _WORKSPACE_DIR - if not root.is_dir(): - return [] - out: list[str] = [] - for run_dir in root.iterdir(): - captures = run_dir / "captures" - if not captures.is_dir(): - continue - for req in captures.glob(f"{incomplete}*.request.json"): - out.append(req.name.removesuffix(".request.json")) - return out - - -@cli.command("export") -@click.argument("targets", nargs=-1, shell_complete=_complete_run_ids) -@click.option( - "--all", "all_runs", is_flag=True, - help="Export every run in the workspace (mutually exclusive with positional run-ids).", -) -@click.option( - "--push", - required=True, - help="``/`` — the Hugging Face Collection base name. " - "Captures parquets land in ``/-captures``, raw session " - "traces in ``/-traces``, and both are added to a " - "Collection titled ```` under ````. Repos and " - "Collection are created on first push.", -) -@click.option( - "--no-scan", "no_scan", is_flag=True, - help="Skip the pre-export trufflehog secret scan. Off by default: " - "any **verified** secret in a target run dir aborts the export " - "before any push happens.", -) -def export_cmd( - targets: tuple[str, ...], all_runs: bool, push: str, no_scan: bool, -) -> None: - """Render captured runs into parquets + upload native traces, in - one shot. Pushes to a paired ``-captures``/``-traces`` dataset - grouped under a Collection. ``TARGETS`` is one or more run-ids - (resolved against the workspace) or paths to a workdir; ``--all`` - exports every run in the workspace. - """ - import json as _json - - from .export import ( - captures_repo_id, - detect_model, - ensure_collection, - parse_collection_base, - push_agent_traces_dataset, - push_captures_dataset, - ) - - try: - owner, base = parse_collection_base(push) - except ValueError as exc: - raise click.UsageError(str(exc)) - if all_runs and targets: - raise click.UsageError("pass --all OR positional run-ids, not both") - if not all_runs and not targets: - raise click.UsageError("specify one or more run-ids/paths, or pass --all") - - workspace = _workspace_root() - if all_runs: - if not workspace.is_dir(): - raise click.UsageError(_no_workspace_msg(workspace)) - targets = tuple( - d.name for d in sorted(workspace.iterdir()) - if d.is_dir() and (d / "run.json").is_file() - ) - if not targets: - raise click.UsageError(f"no runs in {workspace}") - - def _resolve(t: str) -> tuple[Path, str | None, str]: - """Return (capture_dir, agent_from_run_json, run_id) for a target. - run_id is the run-dir basename; it labels both the captures - rows and the traces-dataset folder.""" - # 1. run-id in the workspace. - candidate = workspace / t - if (candidate / "captures").is_dir(): - agent_from = None - meta = candidate / "run.json" - if meta.is_file(): - try: - agent_from = _json.loads(meta.read_text()).get("agent") - except (OSError, _json.JSONDecodeError): - pass - return candidate / "captures", agent_from, candidate.name - # 2. an arbitrary workdir path with captures/ subdir. - p = Path(t) - if (p / "captures").is_dir(): - agent_from = None - meta = p / "run.json" - if meta.is_file(): - try: - agent_from = _json.loads(meta.read_text()).get("agent") - except (OSError, _json.JSONDecodeError): - pass - return p / "captures", agent_from, p.name - # 3. a path that *is* a capture dir. - if p.is_dir() and any(p.glob("*.request.json")): - return p, None, p.parent.name - raise click.UsageError(f"can't resolve {t!r} to a capture dir") - - cap_items: list[dict] = [] - trace_items: list[dict] = [] - for t in targets: - cap_dir, agent, run_id = _resolve(t) - try: - model = detect_model(cap_dir) - except ValueError as exc: - raise click.UsageError(str(exc)) - if model is None: - if all_runs: - click.echo(f" [{t}] skipped (no captures)", err=True) - continue - raise click.UsageError( - f"{cap_dir} has no captured requests with a model field" - ) - cap_items.append({ - "capture_dir": cap_dir, "model": model, "agent": agent, - "run_id": run_id, - }) - # Traces dir is sibling to captures; missing/empty is fine — - # push_traces_dataset accepts it and just records 0 files. - traces_dir = cap_dir.parent / "traces" - trace_items.append({"traces_dir": traces_dir, "run_id": run_id}) - n_traces = sum(1 for _ in traces_dir.iterdir()) \ - if traces_dir.is_dir() else 0 - click.echo( - f" [{t}] (agent={agent or '?'}, model={model}, " - f"traces={n_traces})", - err=True, - ) - if not cap_items: - raise click.UsageError("no runs with captures to export") - - # Pre-export gate: refuse to push if any run carries a verified - # secret. Verification round-trips to each provider's API so - # ``verified`` is high-precision (real, live credential). - # Unverified hits are surfaced but don't block — pattern-only - # detectors hit a real false-positive rate on model output. - if not no_scan: - run_dirs = [Path(c["capture_dir"]).parent for c in cap_items] - n_verified = _scan_run_dirs(run_dirs, no_verification=False) - if n_verified > 0: - raise click.ClickException( - f"export aborted: trufflehog found {n_verified} verified " - "secret(s) — see output above. Inspect, redact, or pass " - "--no-scan to override." - ) - - cap_repo, n_rows_list = push_captures_dataset( - cap_items, owner=owner, base=base, - ) - click.echo( - f"agentcap export: pushed {sum(n_rows_list)} rows across " - f"{len(cap_items)} run(s) -> {cap_repo}", - err=True, - ) - - # Group traces by agent — one dataset per agent so the Hub - # viewer doesn't try to merge incompatible schemas. - by_agent: dict[str, list[dict]] = {} - for cap, tr in zip(cap_items, trace_items): - agent_name = cap.get("agent") or "unknown" - n = sum(1 for _ in tr["traces_dir"].iterdir()) \ - if tr["traces_dir"].is_dir() else 0 - if n == 0: - continue - by_agent.setdefault(agent_name, []).append(tr) - - traces_repos: list[str] = [] - for agent_name, tr_items in sorted(by_agent.items()): - tr_repo, n_files = push_agent_traces_dataset( - tr_items, owner=owner, base=base, agent=agent_name, - ) - traces_repos.append(tr_repo) - click.echo( - f"agentcap export: pushed {n_files} trace file(s) for " - f"{agent_name} across {len(tr_items)} run(s) -> {tr_repo}", - err=True, - ) - - slug = ensure_collection( - owner=owner, base=base, - repos=[captures_repo_id(owner, base), *traces_repos], - ) - click.echo( - f"agentcap export: collection -> https://huggingface.co/collections/{slug}", - err=True, - ) - - -@cli.command("run") -@click.option( - "--agent", - type=click.Choice(_known_drivers()), - required=True, - help="Agent driver to use.", -) -@click.option( - "--model", - default=None, - help="Model id the agent uses in its outbound requests (and that " - "the capture proxy records as the ``model`` field). Required for " - "all drivers — hermes used to default to its own built-in id, but " - "that made captures lie about which model was actually run.", -) -@click.option( - "--upstream", - required=True, - help="Base URL of the upstream model server (e.g. http://127.0.0.1:8000).", -) -@click.option( - "--api-key", - "api_key", - default=None, - envvar="AGENTCAP_API_KEY", - help="Bearer token forwarded to the upstream. Required for " - "authenticated providers (HF Router, OpenAI, Together, …); leave " - "unset for local servers that don't auth (llama serve, vLLM). " - "Falls back to AGENTCAP_API_KEY. For HF Router only, if unset " - "we also auto-try HF_TOKEN and ~/.cache/huggingface/token.", -) -@click.option( - "--sandbox", - "sandbox_dir", - default=None, - type=click.Path(exists=True, file_okay=False, dir_okay=True), - help="Host directory exposed as the agent's cwd (bind-mounted " - "writable into the per-agent container). Use this when the corpus " - "needs the agent to see real source — e.g. a transformers git " - "worktree for the transformers-coding-session corpus. If omitted, " - "an empty ``sandbox/`` is created next to ``captures/`` under the " - "auto-derived run dir.", -) -@click.option( - "--skills", - "skills_dir", - default=None, - type=click.Path(exists=True, file_okay=False, dir_okay=True), - help="Host directory containing a huggingface/skills-shaped " - "checkout (``agents/AGENTS.md`` + ``skills//SKILL.md``). " - "Bind-mounted read-only into the sandbox; the agent's " - "image-side entrypoint wires it into the agent's expected " - "discovery location (``~/.hermes/skills/`` for hermes; " - "``AGENTS.md`` + ``skills/`` symlinks in cwd for " - "opencode/goose/pi).", -) -@click.option( - "--tasks", - "tasks_file", - required=True, - type=click.Path(exists=True, dir_okay=False), - help="Plain-text file with one prompt per line (# comments + blank lines ignored).", -) -@click.option( - "--turns", - type=int, - default=1, - show_default=True, - help="Total turns per task (1 = no follow-ups).", -) -@click.option( - "--followup", - type=click.Choice(["continue", "templates", "synthesized"]), - default="continue", - show_default=True, - help="Follow-up strategy for turns 2..N.", -) -@click.option( - "--timeout", - type=float, - default=1200, - show_default=True, - help="Per-turn timeout in seconds.", -) -def run_cmd( - agent: str, - model: str | None, - upstream: str, - api_key: str | None, - sandbox_dir: str | None, - skills_dir: str | None, - tasks_file: str, - turns: int, - followup: str, - timeout: float, -) -> None: - """Drive an agent CLI through a corpus, capture, summarise.""" - import json - - from .drivers import get_driver, traces_dump_argv_for - from .followups import get_followup - from .orchestrator import Orchestrator, read_tasks_txt - from .provider import _hostname_fallback, refine_for_sub_provider - from .proxy import serve_in_thread - from .sandbox import require_sandbox_or_die - - if not model: - raise click.UsageError( - f"--model is required for --agent {agent}" - ) - - api_key, api_key_source = _resolve_api_key( - upstream=upstream, - explicit_api_key=api_key, - ) - if api_key_source and _is_hf_router_upstream(upstream): - click.echo( - f" [auth] HF Router token source={api_key_source}", - err=True, - ) - - if followup == "synthesized": - fu = get_followup( - "synthesized", upstream=upstream, model=model, api_key=api_key - ) - else: - fu = get_followup(followup) - - # --- sandbox setup: from here on, side effects. - - def _sb_log(msg: str) -> None: - click.echo(f" [sandbox] {msg}", err=True) - - # Hostname classification — used by the sandbox env to pick the - # agent's credential channel (env-var auth vs no-auth) and as part - # of the auto-generated workdir name. - provider_slug = refine_for_sub_provider( - _hostname_fallback(upstream), model - ) - click.echo(f" [provider] {provider_slug}", err=True) - - workdir_p = _default_workdir(agent, provider_slug) - captures = workdir_p / "captures" - sessions = workdir_p / "sessions" - traces = workdir_p / "traces" - state = workdir_p / "state" - captures.mkdir(parents=True, exist_ok=True) - sessions.mkdir(parents=True, exist_ok=True) - traces.mkdir(parents=True, exist_ok=True) - state.mkdir(parents=True, exist_ok=True) - click.echo(f" [workdir] {workdir_p}", err=True) - - # Stub run.json so ``agentcap ls/inspect/export`` can discover this - # run while it's still in flight. Fully overwritten with the final - # summary (incl. per-task durations) at end-of-run. - (workdir_p / "run.json").write_text(json.dumps({ - "agent": agent, - "model": model, - "provider": provider_slug, - "upstream": upstream, - "turns_per_task": turns, - "followup": followup, - "tasks": [], - }, indent=2)) - - # Resolve --sandbox up front: it joins the bind-mount set - # alongside --skills (RO) and the traces dir (RW). - if sandbox_dir is not None: - sandbox_cwd = str(Path(sandbox_dir).resolve()) - else: - default_sandbox = workdir_p / "sandbox" - default_sandbox.mkdir(parents=True, exist_ok=True) - sandbox_cwd = str(default_sandbox) - - tasks = read_tasks_txt(tasks_file) - if not tasks: - raise click.UsageError(f"no tasks found in {tasks_file}") - - def _on_event(event: str, **kw): - click.echo(f" [{event}] " + " ".join(f"{k}={v}" for k, v in kw.items()), err=True) - - # Bind on 0.0.0.0 so the podman container (which has its own netns) - # can dial in via ``host.containers.internal``. Loopback would be - # unreachable from the container side. - with serve_in_thread(upstream, captures, host="0.0.0.0") as proxy: - proxy_url = f"http://host.containers.internal:{proxy.port}/v1" - click.echo(f" [proxy] {proxy_url}", err=True) - - sandbox_env = { - "AGENTCAP_PROXY_URL": proxy_url, - "AGENTCAP_MODEL": model, - "AGENTCAP_PROVIDER": provider_slug, - "AGENTCAP_TRACES_DIR": str(traces.resolve()), - # State dir: SQLite-backed agents (hermes, goose, opencode) - # redirect their session store at it, so the .db lands on - # host as it's written — survives container crashes. Pi - # streams JSONL via the traces symlink and ignores this. - "AGENTCAP_STATE_DIR": str(state.resolve()), - } - if api_key: - sandbox_env["AGENTCAP_API_KEY"] = api_key - sandbox_ro: list[Path] = [] - if skills_dir is not None: - skills_abs = Path(skills_dir).resolve() - sandbox_env["AGENTCAP_SKILLS_DIR"] = str(skills_abs) - sandbox_ro.append(skills_abs) - sandbox_rw: list[Path] = [ - traces.resolve(), - state.resolve(), - Path(sandbox_cwd).resolve(), - ] - # First call per agent builds/boots the image; can take minutes. - sandbox = require_sandbox_or_die( - agent=agent, command="agentcap run", log=_sb_log, - env=sandbox_env, - readonly_paths=sandbox_ro, - writable_paths=sandbox_rw, - ) - - driver_kwargs: dict = { - "sandbox": sandbox, "cwd": sandbox_cwd, "model": model, - } - driver = get_driver(agent, **driver_kwargs) - - click.echo( - f"agentcap run: {len(tasks)} tasks × {turns} turns through " - f"{agent} -> {upstream}", - err=True, - ) - orch = Orchestrator( - driver, fu, sessions_dir=sessions, on_event=_on_event, - set_capture_context=proxy.set_context, - ) - - try: - results = orch.run_corpus( - tasks, turns_per_task=turns, timeout=timeout, - ) - finally: - # Dump SQLite-stored sessions to AGENTCAP_TRACES_DIR for - # agents whose images ship a ``dump-traces`` script - # (goose, opencode). No-op for symlink-style agents - # (pi, hermes) — their transcripts already streamed to - # the host. Failure is logged but never aborts the run. - dump_argv = traces_dump_argv_for(agent) - if dump_argv is not None: - try: - r = sandbox.run( - dump_argv, - env=sandbox_env, - cwd=sandbox_cwd, - timeout=600, - ) - if r.returncode != 0: - click.echo( - f" [traces] dump-traces rc={r.returncode}", - err=True, - ) - except Exception as exc: - click.echo(f" [traces] dump-traces failed: {exc}", err=True) - close = getattr(driver, "close", None) - if callable(close): - close() - sb_close = getattr(sandbox, "close", None) - if callable(sb_close): - sb_close() - - summary = { - "agent": agent, - "model": model, - "provider": provider_slug, - "upstream": upstream, - "turns_per_task": turns, - "followup": followup, - "tasks": [ - { - "task_id": r.task_id, - "prompt": r.prompt, - "session_id": r.session_id, - "completed_turns": r.completed_turns, - "turns": [ - { - "turn": t.turn, - "returncode": t.returncode, - "duration_s": round(t.duration_s, 3), - } - for t in r.turns - ], - } - for r in results - ], - } - (workdir_p / "run.json").write_text(json.dumps(summary, indent=2)) - n_ok = sum(1 for r in results if r.completed_turns == turns) - click.echo( - f"agentcap run: {n_ok}/{len(results)} tasks completed all {turns} turns; " - f"summary -> {workdir_p / 'run.json'}", - err=True, - ) - - -def _scan_run_dirs( - run_dirs: list[Path], - *, - no_verification: bool = False, - rescan: bool = False, -) -> int: - """Run trufflehog over each run dir; print a per-run summary. - Returns the total count of **verified** hits across all runs. - Unverified hits are listed but never abort the caller — - Trufflehog's pattern matchers have a real false-positive rate. - - Persists results to ``/scan.json`` so repeat scans skip - the verification round-trips. Pass ``rescan=True`` to force a - fresh scan.""" - from collections import Counter - - from .scan import TrufflehogMissingError, scan_run_dir - - total_verified = 0 - for run_dir in run_dirs: - try: - result, was_cached = scan_run_dir( - run_dir, - no_verification=no_verification, - rescan=rescan, - ) - except TrufflehogMissingError as exc: - raise click.ClickException(str(exc)) - n_unver = len(result.unverified) - n_ver = len(result.verified) - total_verified += n_ver - cache_tag = " (cached)" if was_cached else "" - click.echo( - f" [scan] {run_dir.name}{cache_tag}: " - f"{result.chunks_scanned} chunks / {result.bytes_scanned} bytes; " - f"verified={n_ver} unverified={n_unver}", - err=True, - ) - # Verified hits are rare + actionable — list each one. - for hit in result.verified: - click.echo( - f" VERIFIED {hit.detector} {hit.file}", - err=True, - ) - # Unverified hits are usually pattern-only false positives - # (Box matches any 32-char alphanumeric, Mailgun any 32-hex, - # …). Summarise by detector instead of dumping every line; - # per-hit detail lives in ``/scan.json``. - if result.unverified: - by_det = Counter(h.detector for h in result.unverified) - tail = ", ".join( - f"{det}={n}" for det, n in by_det.most_common() - ) - click.echo(f" unverified by detector: {tail}", err=True) - return total_verified - - -@cli.command("ls") -@click.argument( - "workspace", - required=False, - type=click.Path(file_okay=False, dir_okay=True, resolve_path=False), -) -@click.option( - "--long", "-l", "long_form", is_flag=True, - help="Long form: include upstream and per-run task counts.", -) -def ls_cmd(workspace: str | None, long_form: bool) -> None: - """List runs under a local workspace. - - Without ``WORKSPACE``, looks at ``./.agentcap/``. Accepts either - the parent dir (where ``agentcap run`` created the ``.agentcap/`` - subdir) or the ``.agentcap/`` dir itself. - - Unlike ``agentcap run`` / ``export``, ``ls`` does NOT consult - ``$AGENTCAP_WORKSPACE`` — what you point it at is what you get. - """ - import json as _json - - if workspace is None: - root = Path.cwd() / _WORKSPACE_DIR - else: - # Normalize before checking .name so paths like ``.``, - # ``.agentcap/.`` or ``foo/`` classify correctly (``Path('.').name`` - # is ``''``, not ``'.agentcap'``). - p = Path(os.path.normpath(workspace)).absolute() - root = p if p.name == _WORKSPACE_DIR else p / _WORKSPACE_DIR - if not root.is_dir(): - click.echo( - f"no workspace at {str(root)!r}. " - f"Run `agentcap run` first, or pass a directory that " - f"contains a ``.agentcap/`` subdir.", - err=True, - ) - return - - rows: list[dict] = [] - for run_dir in sorted(root.iterdir()): - meta_path = run_dir / "run.json" - if not run_dir.is_dir() or not meta_path.is_file(): - continue - try: - meta = _json.loads(meta_path.read_text()) - except (OSError, _json.JSONDecodeError): - continue - captures = run_dir / "captures" - n_caps = ( - len(list(captures.glob("*.request.json"))) if captures.is_dir() else 0 - ) - tasks = meta.get("tasks") or [] - turns = meta.get("turns_per_task", 1) - n_ok = sum(1 for t in tasks if t.get("completed_turns") == turns) - rows.append({ - "run_id": run_dir.name, - "agent": meta.get("agent") or "?", - "model": (meta.get("model") or "?").split("/")[-1], - "provider": meta.get("provider") or "?", - "upstream": meta.get("upstream") or "?", - "n_tasks": len(tasks), - "n_ok": n_ok, - "n_caps": n_caps, - }) - - if not rows: - click.echo(f"no runs in {root}.", err=True) - return - - if long_form: - cols = ["run_id", "agent", "model", "provider", "tasks", "captures", "upstream"] - widths = [ - max(len("run_id"), max(len(r["run_id"]) for r in rows)), - max(len("agent"), max(len(r["agent"]) for r in rows)), - max(len("model"), max(len(r["model"]) for r in rows)), - max(len("provider"), max(len(r["provider"]) for r in rows)), - len("tasks"), - len("captures"), - max(len("upstream"), max(len(r["upstream"]) for r in rows)), - ] - else: - cols = ["run_id", "agent", "model", "tasks", "captures"] - widths = [ - max(len("run_id"), max(len(r["run_id"]) for r in rows)), - max(len("agent"), max(len(r["agent"]) for r in rows)), - max(len("model"), max(len(r["model"]) for r in rows)), - len("tasks"), - len("captures"), - ] - - def _fmt(cells: list[str]) -> str: - return " ".join(c.ljust(w) for c, w in zip(cells, widths)) - - click.echo(_fmt([c.upper() for c in cols])) - for r in rows: - tasks_cell = f"{r['n_ok']}/{r['n_tasks']}" - if long_form: - click.echo(_fmt([ - r["run_id"], r["agent"], r["model"], r["provider"], - tasks_cell, str(r["n_caps"]), r["upstream"], - ])) - else: - click.echo(_fmt([ - r["run_id"], r["agent"], r["model"], - tasks_cell, str(r["n_caps"]), - ])) - - -def _resolve_request_id( - rid: str, source: str | None, *, workspace: Path | None = None, -) -> tuple[str, dict, dict | None, dict | None, Path | None]: - """Resolve ``rid`` (full or short prefix) to - ``(full_rid, body, response_record, request_record, capture_dir)``. - - - If ``source`` is given, looks the rid up there via - ``captures.load_request`` (any agentcap-supported source: dir, - parquet, hf://) — exact match only. Response and request - records and ``capture_dir`` are unavailable in that path - (just the body). - - Otherwise scans ``workspace`` (defaults to ``_workspace_root()`` - for legacy ``run`` / ``export`` callers; ``inspect`` - passes cwd explicitly), accepting a prefix (git-style) and - returning the body, the paired response, the full request - record (which carries ``task_id``, ``turn``, ``captured_at``, - ``upstream_url``), and the capture dir the rid was found in. - """ - from . import captures - - if source is not None: - try: - return rid, captures.load_request(source, rid), None, None, None - except KeyError as exc: - raise click.UsageError(str(exc)) - except (ValueError, FileNotFoundError) as exc: - raise click.UsageError(str(exc)) - - if workspace is None: - workspace = _workspace_root() - try: - found = captures.resolve_workspace_rid(workspace, rid) - except captures.AmbiguousRequestId as exc: - raise click.UsageError(str(exc)) - if found is None: - raise click.UsageError( - f"request_id {rid!r} not found in workspace at {workspace}; " - f"pass a different TARGET (a dir, .parquet, or hf:// URI)." - ) - capture_dir, full_rid = found - import json as _json - req_rec = _json.loads( - (capture_dir / f"{full_rid}.request.json").read_text() - ) - resp_path = capture_dir / f"{full_rid}.response.json" - resp_rec = ( - _json.loads(resp_path.read_text()) if resp_path.is_file() else None - ) - body = req_rec.get("body") - if not isinstance(body, dict): - raise click.UsageError( - f"capture {capture_dir / f'{full_rid}.request.json'} has no body field" - ) - return full_rid, body, resp_rec, req_rec, capture_dir - - -def _enumerate_workspace_requests( - scope: str | None, *, workspace: Path | None = None, -) -> list[dict]: - """Walk captures across the workspace (or one run if ``scope`` is a - run-id) and return one row per captured request, grouped by run - then chronological within each run. Each row has ``run_id``, - ``rid``, ``captured_at``, ``status``, and ``preview`` (last user - message, truncated). ``workspace`` defaults to ``_workspace_root()`` - so legacy callers don't break; ``inspect`` passes it - explicitly from the resolved TARGET.""" - import json as _json - - root = workspace if workspace is not None else _workspace_root() - if not root.is_dir(): - return [] - run_dirs = ( - [root / scope] if scope else [d for d in sorted(root.iterdir()) if d.is_dir()] - ) - rows: list[dict] = [] - for run_dir in run_dirs: - captures = run_dir / "captures" - if not captures.is_dir(): - continue - # Sort within (task, time) so per-task ``prev_rid`` is the - # immediately-preceding capture in chronological order. - recs: list[tuple[str, dict]] = [] - for req_path in captures.glob("*.request.json"): - rid = req_path.stem.split(".")[0] - try: - req = _json.loads(req_path.read_text()) - except (OSError, _json.JSONDecodeError): - continue - recs.append((rid, req)) - recs.sort( - key=lambda r: (r[1].get("task_id") or "", r[1].get("captured_at", 0)) - ) - prev_rid_by_task: dict = {} - prev_msgs_by_task: dict = {} - idx_by_task: dict = {} - for rid, req in recs: - resp_path = captures / f"{rid}.response.json" - status = "?" - if resp_path.is_file(): - try: - status = str(_json.loads(resp_path.read_text()).get("status_code", "?")) - except (OSError, _json.JSONDecodeError): - pass - messages = (req.get("body") or {}).get("messages") or [] - task_id = req.get("task_id") - # When task_id is missing, key the per-task caches on the - # rid so unrelated orphan captures don't accidentally chain - # together for the diff / prev_rid / req_index. - task_key = task_id if task_id is not None else rid - prev_msgs = prev_msgs_by_task.get(task_key) - if prev_msgs is None: - new_msgs = messages - label = f"(init {len(new_msgs)})" - else: - removed, new_msgs = _diff_messages(prev_msgs, messages) - label = f"({_delta_label(len(removed), len(new_msgs))})" - summary = _message_summary(new_msgs[-1]) if new_msgs else "" - preview = f"{label} {summary}".replace("\n", " ").strip() - # Concatenate every new message's content into a single - # searchable blob so fzf can match against deeper content - # (e.g. ``hf-cli`` referenced 4 messages back in the diff) - # without bloating the visible row. - searchable = " ".join( - _message_text(m) for m in new_msgs - ).replace("\n", " ").replace("\t", " ") - prev_rid = prev_rid_by_task.get(task_key) - prev_msgs_by_task[task_key] = messages - prev_rid_by_task[task_key] = rid - idx_by_task[task_key] = idx_by_task.get(task_key, 0) + 1 - rows.append({ - "run_id": run_dir.name, - "rid": rid, - "captured_at": int(req.get("captured_at", 0)), - "status": status, - "task_id": task_id, - "turn": req.get("turn"), - "req_index": idx_by_task[task_key], - "prev_rid": prev_rid, - "preview": preview, - "searchable": searchable, - }) - rows.sort(key=lambda r: (r["run_id"], r["captured_at"])) - return rows - - -def _enumerate_parquet_requests(parquet_path: Path) -> list[dict]: - """Same row shape as ``_enumerate_workspace_requests`` but sourced - from a single ``-captures`` parquet (``agentcap export`` output). - Newer parquets carry ``task_id`` / ``turn`` so the diff / prev_rid - chain groups per (run, task); older ones without those columns - fall back to one linear chain per ``run_id`` and the LOC cell - just stays ``-``.""" - import json as _json - import pyarrow.parquet as pq - - table_meta = pq.ParquetFile(str(parquet_path)).schema_arrow - available = set(table_meta.names) - cols = ["request_id", "captured_at", "request", "response", "run_id"] - has_task = "task_id" in available - has_turn = "turn" in available - if has_task: - cols.append("task_id") - if has_turn: - cols.append("turn") - t = pq.read_table(str(parquet_path), columns=cols) - n = t.num_rows - if n == 0: - return [] - rids = t.column("request_id").to_pylist() - times = t.column("captured_at").to_pylist() - reqs = t.column("request").to_pylist() - resps = t.column("response").to_pylist() - runs = t.column("run_id").to_pylist() - task_ids = t.column("task_id").to_pylist() if has_task else [None] * n - turns = t.column("turn").to_pylist() if has_turn else [None] * n - - order = sorted(range(n), key=lambda i: (runs[i] or "", int(times[i] or 0))) - rows: list[dict] = [] - prev_msgs: dict = {} - prev_rid: dict = {} - idx_by_key: dict = {} - # Drop rows whose request_id isn't the proxy's 32-hex format. - # The picker would reject them anyway (``_pick_parquet_request`` - # validates via the same regex) and they get interpolated into - # the fzf preview shell command via ``{2}`` / ``{3}`` — keeping - # them out at enumeration time also closes the door on any - # injection vector from a malformed parquet. - import re - _hex_rid = re.compile(r"[0-9a-f]{32}") - for i in order: - rid = rids[i] - if not rid or not _hex_rid.fullmatch(rid): - continue - try: - body = _json.loads(reqs[i] or "{}") - except _json.JSONDecodeError: - body = {} - messages = body.get("messages") or [] - run_id = runs[i] or "?" - task_id = task_ids[i] - # Mirror workspace semantics: group prev/diff by (run, task); - # fall back to (run, rid) when task_id is missing so unrelated - # rows don't chain into one synthetic task. - key = (run_id, task_id if task_id is not None else rid) - prior = prev_msgs.get(key) - if prior is None: - new_msgs = messages - label = f"(init {len(new_msgs)})" - else: - removed, new_msgs = _diff_messages(prior, messages) - label = f"({_delta_label(len(removed), len(new_msgs))})" - summary = _message_summary(new_msgs[-1]) if new_msgs else "" - preview = f"{label} {summary}".replace("\n", " ").strip() - searchable = " ".join( - _message_text(m) for m in new_msgs - ).replace("\n", " ").replace("\t", " ") - status = "?" - try: - status = str(_json.loads(resps[i] or "{}").get("status_code", "?")) - except _json.JSONDecodeError: - pass - idx_by_key[key] = idx_by_key.get(key, 0) + 1 - rows.append({ - "run_id": run_id, - "rid": rid, - "captured_at": int(times[i] or 0), - "status": status, - "task_id": task_id, - "turn": turns[i], - "req_index": idx_by_key[key], - "prev_rid": prev_rid.get(key), - "preview": preview, - "searchable": searchable, - }) - prev_msgs[key] = messages - prev_rid[key] = rid - return rows - - -def _format_inspect_rows(rows: list[dict]) -> tuple[str, list[str]]: - """Flat table: one row per captured call. Columns are LOC - (``task_id.``), RID, RUN (shown only when rows span - multiple runs — redundant otherwise), MESSAGES (``(+N)`` / - ``(init N)`` / ``(-X +Y)`` delta + one-line role-aware summary). - Time / status / model / size live in the fzf preview pane. - - Returns ``(header, fzf_lines)``. Each fzf line is the visible - content followed by tab-delimited hidden columns the preview - command pulls via ``{2}`` / ``{3}`` (full rid, previous rid) plus - a searchable blob fzf matches against (column 4).""" - include_run = len({r.get("run_id") for r in rows}) > 1 - - rid_w = 8 - loc_w = max( - len("LOC"), - max(( - len(f"{r.get('task_id') or '?'}.{r.get('req_index')}") - if r.get("task_id") and r.get("req_index") is not None else 1 - for r in rows - ), default=0), - ) - run_w = ( - max(len("RUN"), max((len(r["run_id"]) for r in rows), default=0)) - if include_run else 0 - ) - - def _row(loc, rid, run, prompt) -> str: - cells = [f"{loc:<{loc_w}}", f"{rid:<{rid_w}}"] - if include_run: - cells.append(f"{run:<{run_w}}") - cells.append(prompt) - return " ".join(cells) - - header = _row("LOC", "RID", "RUN", "MESSAGES") - - fzf: list[str] = [] - prev_task: str | None = None - for r in rows: - loc = ( - f"{r.get('task_id') or '?'}.{r.get('req_index')}" - if r.get("task_id") and r.get("req_index") is not None - else "-" - ) - # Strip tabs from the visible content so they don't shift the - # tab-delimited hidden columns appended below. - line = _row(loc, r["rid"][:8], r["run_id"], r["preview"]).replace("\t", " ") - task_id = r.get("task_id") - if task_id and task_id != prev_task: - # Reverse video: inverts fg/bg so the row pops on any - # terminal palette regardless of theme. - line = f"\033[7m{line}\033[0m" - prev_task = task_id - # Hidden tab columns (fzf searches all of them by default): - # 2 = full rid, 3 = prev rid, 4 = concatenated new-message - # bodies so a query like ``hf-cli`` matches rows whose deeper - # content references it. - fzf.append( - f"{line}\t{r['rid']}\t{r.get('prev_rid') or '-'}" - f"\t{r.get('searchable') or ''}" - ) - return header, fzf - - -def _fzf_pick( - header: str | None, - lines: list[str], - preview_cmd: str, - *, - extra_args: Sequence[str] = (), -) -> str | None: - """Run fzf over ``lines``. Returns the selected line, or ``None`` - if the user cancelled (Esc / Ctrl-C). - - ``header=None`` means the first element of ``lines`` is the header - (passed to fzf via ``--header-lines=1`` so it stays in lockstep - with the body on reload — needed when the column widths grow as - background fetches land). Otherwise ``--header=
`` pins - a static line above the body.""" - import shutil - import subprocess - - if not shutil.which("fzf"): - raise click.UsageError( - "fzf is required for interactive pickers " - "(install via 'brew install fzf' or your distro's package manager)." - ) - - args = [ - "fzf", - "--ansi", - "--layout=reverse", - "--header-first", - "--preview", preview_cmd, - "--preview-window=right:60%:wrap", - "--no-sort", - ] - if header is None: - args += ["--header-lines=1"] - else: - args += ["--header", header] - args.extend(extra_args) - proc = subprocess.run( - args, - input="\n".join(lines), - capture_output=True, - text=True, - ) - if proc.returncode != 0: - return None - return proc.stdout.rstrip("\n") or None - - -def _pick_workspace_run(*, workspace: Path | None = None) -> str | None: - """Open an fzf picker over the runs in the workspace, returning the - selected run-id, or ``None`` if cancelled. fzf is a hard - requirement of ``inspect``; the gate lives at the top of - ``inspect_cmd``. ``workspace`` defaults to ``_workspace_root()``; - ``inspect`` passes cwd or the resolved dir explicitly.""" - import json as _json - import sys - - root = workspace if workspace is not None else _workspace_root() - if not root.is_dir(): - raise click.UsageError(f"no workspace at {root}") - - rows: list[dict] = [] - for run_dir in sorted(root.iterdir()): - meta_path = run_dir / "run.json" - if not run_dir.is_dir() or not meta_path.is_file(): - continue - try: - meta = _json.loads(meta_path.read_text()) - except (OSError, _json.JSONDecodeError): - continue - captures = run_dir / "captures" - n_caps = ( - len(list(captures.glob("*.request.json"))) if captures.is_dir() else 0 - ) - if n_caps == 0: - continue # skip empty runs — nothing to inspect - tasks = meta.get("tasks") or [] - rows.append({ - "run_id": run_dir.name, - "agent": meta.get("agent") or "?", - "model": (meta.get("model") or "?").split("/")[-1], - "n_tasks": len(tasks), - "n_caps": n_caps, - }) - if not rows: - raise click.UsageError(f"no runs with captures in {root}") - - agent_w = max(len("AGENT"), max(len(r["agent"]) for r in rows)) - model_w = max(len("MODEL"), max(len(r["model"]) for r in rows)) - - def _row(agent, model, tasks, caps) -> str: - return ( - f"{agent:<{agent_w}} {model:<{model_w}} " - f"{tasks:>5} {caps:>4}" - ) - - header = _row("AGENT", "MODEL", "TASKS", "CAPS") - # Tab-delim hidden col 2 carries the run_id — picker shells out to - # ``_run_preview`` with it, and we extract it from the picked line - # below. Visible layout matches the HF parquet picker's terser - # ``AGENT MODEL …`` style. - lines = [ - _row(r["agent"], r["model"], str(r["n_tasks"]), str(r["n_caps"])) - + f"\t{r['run_id']}" - for r in rows - ] - import shlex - ws_arg = f"--workspace {shlex.quote(str(root))}" - preview = ( - f"{sys.executable} -m agentcap _run_preview {ws_arg} {{2}}" - f" 2>/dev/null | head -200" - ) - picked = _fzf_pick( - header, lines, preview, - extra_args=["--delimiter", "\t", "--with-nth", "1"], - ) - if picked is None: - return None - fields = picked.rsplit("\t", 1) - return fields[1].strip() if len(fields) == 2 else None - - -def _pick_workspace_request( - scope: str | None, *, initial_short_rid: str | None = None, - workspace: Path | None = None, -) -> str | None: - """fzf picker for a workspace request. Returns the picked short - rid, or ``None`` if cancelled. fzf is a hard requirement of - ``inspect``; the gate lives at the top of ``inspect_cmd``. - - ``initial_short_rid`` (if given) positions the cursor on the row - whose rid starts with that prefix when the picker opens — used - when re-entering the picker from the message sub-picker so the - user lands back where they were. ``workspace`` defaults to - ``_workspace_root()``; ``inspect`` passes it - explicitly from the resolved TARGET.""" - import shlex - import sys - - if workspace is None: - workspace = _workspace_root() - rows = _enumerate_workspace_requests(scope, workspace=workspace) - if not rows: - where = f"run {scope!r}" if scope else "workspace" - raise click.UsageError(f"no captured requests in {where}") - - header, fzf_lines = _format_inspect_rows(rows) - # Tab-delim hidden columns: 2 = full rid, 3 = previous-capture rid - # (or "-" for the first capture of a task). Pre-computing the prev - # rid here lets the preview pane skip a full cap-dir rescan per - # fzf hover. ``_highlight`` wraps each occurrence of fzf's current - # query (``{q}``) in red so the user can see where the match - # landed inside the preview. ``{q}`` is its own positional arg so - # fzf's automatic shell-escaping handles quoting end-to-end. - ws_arg = f"--workspace {shlex.quote(str(workspace))}" - preview = ( - f"{sys.executable} -m agentcap _preview {ws_arg} {{2}} {{3}}" - f" 2>/dev/null | head -400" - f" | {sys.executable} -m agentcap _highlight {{q}}" - ) - - extra = [ - "--delimiter", "\t", "--with-nth", "1", - "--no-hscroll", - "--bind", "change:refresh-preview", - ] - if initial_short_rid: - for i, line in enumerate(fzf_lines, start=1): - parts = line.split("\t") - # Hidden column 2 carries the full rid; match by prefix. - if len(parts) >= 2 and parts[1].startswith(initial_short_rid): - # ``load`` fires after fzf finishes reading stdin so - # the items exist when ``pos(N)`` runs (``start`` is - # too early — fires before items are loaded). - extra.extend(["--bind", f"load:pos({i})"]) - break - - picked = _fzf_pick( - header, fzf_lines, preview, - extra_args=extra, - ) - if picked is None: - return None # cancelled - # picked is the visible (column-1) line; RID is the second - # whitespace-separated field on it. - tokens = picked.split() - short = tokens[1] if len(tokens) >= 2 else "" - import re - if not re.fullmatch(r"[0-9a-f]{8}", short): - return None - return short - - -def _classify_target(target: str | None) -> tuple[str, object]: - """Classify the ``TARGET`` positional of ``inspect``. - - Returns ``(kind, payload)``: - - ``("workspace", Path)`` — local ``.agentcap`` dir to browse. - - ``("workspace-run", run_id)`` — scope to one run under cwd's - ``.agentcap`` (``run_id`` is the dir name). - - ``("rid", rid)`` — body dump; rid looked up in cwd's workspace. - - ``("parquet", Path)`` — local ``.parquet`` file. - - ``("hf", "/")`` — HF dataset of captures. - - Detection is content-based: ``/`` is treated as HF - only when no local directory by that name exists, so a relative - path like ``./my-org/my-data`` (or ``my-org/my-data`` when it - exists as a dir) wins over the HF interpretation. Run-id and rid - are inferred from shape + existence under cwd's ``.agentcap``.""" - import re - if target is None: - return "workspace", Path.cwd() / _WORKSPACE_DIR - - if target.endswith(".parquet"): - p = Path(target) - if not p.is_file(): - raise click.UsageError(f"parquet not found: {target}") - return "parquet", p - - if target.startswith("hf://"): - s = target.removeprefix("hf://datasets/").removeprefix("hf://").strip("/") - if s.count("/") == 1 and all(s.split("/")): - return "hf", s - raise click.UsageError(f"invalid hf URI: {target!r}") - - # Local directory → workspace (accept either parent or .agentcap). - # Normalize first so ``.`` / ``/.agentcap/.`` / trailing-slash - # forms classify correctly (``Path('.').name`` is ``''``, not - # ``'.agentcap'``). - if Path(target).is_dir(): - p = Path(os.path.normpath(target)).absolute() - ws = p if p.name == _WORKSPACE_DIR else p / _WORKSPACE_DIR - return "workspace", ws - - # Run-id under cwd's .agentcap (run dirs always carry a timestamp, - # so they reliably contain a dash). - cwd_ws = Path.cwd() / _WORKSPACE_DIR - if "-" in target and (cwd_ws / target / "run.json").is_file(): - return "workspace-run", target - - # ``/`` HF shorthand — only when it's not a local path. - if target.count("/") == 1 and all(target.split("/")): - return "hf", target - - # All-hex string → request-id (looked up in cwd workspace). - if re.fullmatch(r"[0-9a-f]+", target) and len(target) >= 6: - return "rid", target - - raise click.UsageError( - f"can't classify TARGET {target!r}: expected a directory, " - f"a .parquet file, an hf:// URI, an / shorthand, " - f"a run-id (under ./.agentcap/), or a request-id (hex)." - ) - - -@functools.lru_cache(maxsize=1) -def _hf_filesystem(): - """Authenticated, process-wide ``HfFileSystem``.""" - from huggingface_hub import HfFileSystem, get_token - return HfFileSystem(token=get_token()) - - -def _fetch_hf_parquet_meta( - repo_id: str, path: str, *, - revision: str | None = None, - kv_only: bool = False, -) -> dict: - """Returns ``{agent, model, num_rows, tasks: [{id, turns, prompt}]}``. - ``kv_only=True`` skips the row-group reads — return value has no - ``tasks`` key in that case (the preview cmd uses its presence to - distinguish a partial write from "no task_id schema").""" - import json as _json - import pyarrow.parquet as pq - from huggingface_hub import try_to_load_from_cache - out: dict = {"agent": None, "model": None, "num_rows": 0} - - opener = None - if revision: - local = try_to_load_from_cache( - repo_id=repo_id, filename=path, - repo_type="dataset", revision=revision, - ) - if isinstance(local, str) and Path(local).is_file(): - opener = open(local, "rb") - if opener is None: - opener = _hf_filesystem().open(f"datasets/{repo_id}/{path}", "rb") - - with opener as fh: - pf = pq.ParquetFile(fh) - out["num_rows"] = pf.metadata.num_rows - # Schema-level KV metadata: ``export_local`` stamps ``agent`` - # / ``model`` / ``tasks`` here. Bytes-keyed; ``None`` when - # missing. - schema_md = pf.schema_arrow.metadata or {} - for key in ("agent", "model"): - v = schema_md.get(key.encode()) - if v: - out[key] = v.decode("utf-8", errors="replace") - tasks_raw = schema_md.get(b"tasks") - if tasks_raw: - try: - out["tasks"] = _json.loads(tasks_raw.decode("utf-8")) - return out # KV has the full preview slice — no row-group read needed - except _json.JSONDecodeError: - pass - if kv_only: - return out - # Legacy fallback for parquets exported before tasks landed in KV. - out["tasks"] = [] - cols = pf.schema_arrow.names - if "task_id" in cols and pf.num_row_groups: - # Row group 0 sample only — tasks in later row groups - # don't show up in the preview. - rg_cols = ["task_id"] - if "turn" in cols: - rg_cols.append("turn") - if "request" in cols: - rg_cols.append("request") - rg = pf.read_row_group(0, columns=rg_cols) - tids = rg.column("task_id").to_pylist() - turns = ( - rg.column("turn").to_pylist() - if "turn" in rg_cols else [None] * len(tids) - ) - raws = ( - rg.column("request").to_pylist() - if "request" in rg_cols else [None] * len(tids) - ) - per_task: dict[str, dict] = {} - for tid, t, raw in zip(tids, turns, raws): - if not tid: - continue - d = per_task.setdefault(tid, {"turns": 0, "prompt": None}) - if t is not None and int(t) > d["turns"]: - d["turns"] = int(t) - if d["prompt"] is None and raw: - try: - msgs = (_json.loads(raw) or {}).get("messages") or [] - except (_json.JSONDecodeError, ValueError, TypeError): - msgs = [] - for m in msgs: - if m.get("role") == "user": - d["prompt"] = _message_text(m).replace("\n", " ") - break - out["tasks"] = [ - {"id": tid, "turns": per_task[tid]["turns"], - "prompt": per_task[tid]["prompt"]} - for tid in sorted(per_task) - ] - return out - - -def _hf_list_parquets(repo_id: str) -> list[dict]: - """``.parquet`` files in ``/`` as ``[{path, size}, ...]``, - sorted by path. Per-parquet metadata is hydrated later from each - parquet's footer.""" - from huggingface_hub import HfApi - api = HfApi() - tree = api.list_repo_tree(repo_id, repo_type="dataset", recursive=True) - base: list[dict] = [] - for entry in tree: - path = getattr(entry, "path", None) or getattr(entry, "rfilename", None) - if not path or not path.endswith(".parquet"): - continue - size = getattr(entry, "size", None) or 0 - base.append({"path": path, "size": int(size)}) - base.sort(key=lambda r: r["path"]) - return base - - -def _hf_meta_tempfile(tempdir: Path, path: str) -> Path: - """SHA-1 prefix avoids collisions across HF paths.""" - import hashlib - digest = hashlib.sha1(path.encode()).hexdigest()[:16] - return tempdir / f"{Path(path).stem}-{digest}.json" - - -def _write_meta_atomic(target: Path, meta: dict) -> None: - """Atomic write via .tmp + rename so the fzf preview cmd never - sees a half-written file.""" - import json as _json - tmp = target.with_suffix(target.suffix + ".tmp") - tmp.write_text(_json.dumps(meta, ensure_ascii=False)) - tmp.replace(target) - - -_HF_PREVIEW_TASK_LIMIT = 15 - - -@cli.command("_hf_parquet_preview", hidden=True) -@click.option( - "--tempdir", "tempdir_str", required=True, - type=click.Path(file_okay=False, dir_okay=True), - help="Session tempdir populated by the prefetch subprocess.", -) -@click.argument("path") -def _hf_parquet_preview_cmd(tempdir_str: str, path: str) -> None: - """Render the fzf preview pane from the parquet's tempfile. - Exits immediately if not yet on disk; the prefetch subprocess - POSTs ``refresh-preview`` to re-invoke us once it lands.""" - import json as _json - tempdir = Path(tempdir_str) - target = _hf_meta_tempfile(tempdir, path) - - click.echo(f"path: {path}") - if not target.is_file(): - click.echo("loading…") - return - - try: - meta = _json.loads(target.read_text()) - except (OSError, _json.JSONDecodeError) as exc: - click.echo(f"(preview failed: {type(exc).__name__}: {exc})") - return - click.echo(f"agent: {meta.get('agent') or '?'}") - click.echo(f"model: {meta.get('model') or '?'}") - click.echo(f"rows: {meta.get('num_rows', 0):,}") - if "tasks" not in meta: - click.echo("tasks: …") - return - tasks = meta["tasks"] - click.echo(f"tasks: {len(tasks)}") - if not tasks: - click.echo() - click.echo("(no task_id column — pre-schema-upgrade parquet)") - return - click.echo() - click.echo("─── TASKS ───") - shown = tasks[:_HF_PREVIEW_TASK_LIMIT] - for t in shown: - prompt = _flatten(t.get("prompt") or "(no user message)", 120) - click.echo(f" {t['id']}: ({t.get('turns', 0)} turns) {prompt}") - hidden = len(tasks) - len(shown) - if hidden > 0: - click.echo(f" … and {hidden} more") - - -def _short_model(model: str | None) -> str: - """Strip the ``org/`` prefix for display, matching the local run - picker's layout.""" - return model.rsplit("/", 1)[-1] if model else "..." - - -def _picker_rows(tempdir: Path, entries: list[dict]) -> list[str]: - """Build the picker's header + row list from current tempdir state. - Returns ``[header, *body]``. Columns ``AGENT MODEL TASKS CAPS`` - mirror the local run picker. ``TASKS`` shows ``?`` until Pass B - writes the task list for that row; widths are dynamic so layout - stays tight as KV / task counts land via fzf reload.""" - import json as _json - loaded: list[tuple[str | None, str | None, str, str, str]] = [] - for entry in entries: - path = entry["path"] - agent = model = None - n_tasks = "?" - n_caps = "?" - tmpfile = _hf_meta_tempfile(tempdir, path) - if tmpfile.is_file(): - try: - meta = _json.loads(tmpfile.read_text()) - agent = meta.get("agent") - model = meta.get("model") - if meta.get("num_rows") is not None: - n_caps = str(meta["num_rows"]) - if "tasks" in meta: - n_tasks = str(len(meta["tasks"])) - except (OSError, _json.JSONDecodeError): - pass - loaded.append((agent, model, n_tasks, n_caps, path)) - - def _w(label: str, fn) -> int: - return max(len(label), *(len(fn(r)) for r in loaded)) if loaded else len(label) - - agent_w = _w("AGENT", lambda r: r[0] or "...") - model_w = _w("MODEL", lambda r: _short_model(r[1])) - tasks_w = _w("TASKS", lambda r: r[2]) - caps_w = _w("CAPS", lambda r: r[3]) - - def _line(agent: str, model: str, tasks: str, caps: str, path: str = "") -> str: - return ( - f"{agent:<{agent_w}} {model:<{model_w}} " - f"{tasks:>{tasks_w}} {caps:>{caps_w}}" - + (f"\t{path}" if path else "") - ) - - header = _line("AGENT", "MODEL", "TASKS", "CAPS") - body = [ - _line(a or "...", _short_model(m), t, c, p) - for a, m, t, c, p in loaded - ] - return [header, *body] - - -@cli.command("_hf_picker_list", hidden=True) -@click.option( - "--tempdir", "tempdir_str", required=True, - type=click.Path(file_okay=False, dir_okay=True), -) -@click.option( - "--paths-file", "paths_file", required=True, - type=click.Path(file_okay=True, dir_okay=False), -) -def _hf_picker_list_cmd(tempdir_str: str, paths_file: str) -> None: - """Emit current rows to stdout. fzf's ``reload(...)`` source — - re-invoked after each Pass-A write.""" - import json as _json - try: - entries = _json.loads(Path(paths_file).read_text()) - except (OSError, _json.JSONDecodeError): - return - for line in _picker_rows(Path(tempdir_str), entries): - click.echo(line) - - -@cli.command("_hf_prefetch", hidden=True) -@click.option( - "--tempdir", "tempdir_str", required=True, - type=click.Path(file_okay=False, dir_okay=True), -) -@click.option("--repo", "repo_id", required=True) -@click.option( - "--fzf-port", "fzf_port", type=int, default=None, - help="HTTP port of fzf's --listen server, for refresh-preview " - "after each successful fetch.", -) -@click.option("--revision", "revision", type=str, default=None) -@click.option("--paths-file", "paths_file", type=str, default=None) -def _hf_prefetch_cmd( - tempdir_str: str, repo_id: str, fzf_port: int | None, - revision: str | None, paths_file: str | None, -) -> None: - """Background fetcher: reads paths from stdin, runs Pass A - (parallel KV-only) and Pass B (serial full) concurrently, POSTs - fzf actions after each successful write. SIGKILLed by the picker - when fzf exits.""" - import json as _json - import shlex as _shlex - import sys as _sys - import urllib.error - import urllib.request - tempdir = Path(tempdir_str) - try: - paths = _json.loads(_sys.stdin.read()) - except (OSError, _json.JSONDecodeError): - return - - # fzf invokes this as a shell cmd on each ``reload(...)`` POST. - reload_cmd = ( - f"{_shlex.quote(_sys.executable)} -m agentcap _hf_picker_list" - f" --tempdir={_shlex.quote(tempdir_str)}" - f" --paths-file={_shlex.quote(paths_file or '')}" - ) if paths_file else None - - def _nudge_fzf(body: bytes) -> None: - if fzf_port is None: - return - req = urllib.request.Request( - f"http://127.0.0.1:{fzf_port}/", data=body, method="POST", - ) - try: - with urllib.request.urlopen(req, timeout=0.5) as resp: - resp.read() - except (urllib.error.URLError, OSError): - pass # fzf not up yet, or already exited — harmless - - def _has_tasks(target: Path) -> bool: - """True if target already holds a Pass-B (full) write.""" - if not target.is_file(): - return False - try: - return "tasks" in _json.loads(target.read_text()) - except (OSError, _json.JSONDecodeError): - return False - - # Pass A: KV-only footer reads, 4-way parallel. - def _pass_kv(path: str) -> None: - target = _hf_meta_tempfile(tempdir, path) - if _has_tasks(target): - return # full data already there; don't clobber - try: - meta = _fetch_hf_parquet_meta( - repo_id, path, revision=revision, kv_only=True, - ) - except Exception: # noqa: BLE001 - return - # Re-check: Pass-B may have finished writing the FULL file - # while our network fetch was in flight; overwriting it with - # KV-only would discard tasks and leave the preview stuck. - if _has_tasks(target): - return - try: - _write_meta_atomic(target, meta) - except OSError: - return - if reload_cmd is not None: - _nudge_fzf(f"reload({reload_cmd})".encode()) - - # Pass B: full row-group reads, serial (avoids HF retry storms). - def _pass_full(path: str) -> None: - target = _hf_meta_tempfile(tempdir, path) - if _has_tasks(target): - return - try: - meta = _fetch_hf_parquet_meta( - repo_id, path, revision=revision, kv_only=False, - ) - except Exception: # noqa: BLE001 - return - try: - _write_meta_atomic(target, meta) - except OSError: - return - # ``reload`` so the row's TASKS count refreshes; - # ``refresh-preview`` so the focused row's preview pane picks up - # the new task list. - if reload_cmd is not None: - _nudge_fzf(f"reload({reload_cmd})+refresh-preview".encode()) - else: - _nudge_fzf(b"refresh-preview") - - import threading - from concurrent.futures import ThreadPoolExecutor - - # Passes run concurrently so labels and previews fill in - # independently. - def _run_pass_a() -> None: - with ThreadPoolExecutor(max_workers=4) as pool: - list(pool.map(_pass_kv, paths)) - - a_thread = threading.Thread(target=_run_pass_a, daemon=True) - a_thread.start() - for path in paths: - _pass_full(path) - a_thread.join(timeout=5) - - -def _pick_hf_dataset_parquet( - repo_id: str, tempdir: Path, - rows: list[dict], revision: str | None, -) -> Path | None: - """Pick a parquet from an HF dataset repo. Returns its local - path (via ``hf_hub_download`` after the user selects), or - ``None`` on Esc. ``tempdir`` outlives this call so re-entries - reuse already-fetched tempfiles.""" - import json as _json - import shlex - import socket - import subprocess - import sys - from huggingface_hub import hf_hub_download - - paths_file = tempdir / "paths.json" - paths_file.write_text(_json.dumps( - [{"path": r["path"], "size": r["size"]} for r in rows], - )) - - lines = _picker_rows( - tempdir, [{"path": r["path"], "size": r["size"]} for r in rows], - ) - - # Pre-allocate fzf's --listen port. - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("127.0.0.1", 0)) - fzf_port = s.getsockname()[1] - - manifest = _json.dumps([r["path"] for r in rows]) - proc = subprocess.Popen( - [sys.executable, "-m", "agentcap", "_hf_prefetch", - "--tempdir", str(tempdir), "--repo", repo_id, - "--fzf-port", str(fzf_port), - "--paths-file", str(paths_file), - *(["--revision", revision] if revision else []), - ], - stdin=subprocess.PIPE, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - try: - assert proc.stdin is not None - proc.stdin.write(manifest.encode()) - proc.stdin.close() - except (BrokenPipeError, OSError): - pass # subprocess died early — we'll detect it via poll() - - preview = ( - f"{sys.executable} -m agentcap _hf_parquet_preview" - f" --tempdir={shlex.quote(str(tempdir))} {{2}} 2>/dev/null" - ) - try: - picked = _fzf_pick( - None, lines, preview, - extra_args=[ - "--delimiter", "\t", "--with-nth", "1", - "--no-hscroll", - f"--listen=127.0.0.1:{fzf_port}", - ], - ) - finally: - if proc.poll() is None: - proc.kill() - try: - proc.wait(timeout=2) - except subprocess.TimeoutExpired: - pass - if picked is None: - return None - rel = picked.rsplit("\t", 1)[-1].strip() - return Path(hf_hub_download( - repo_id=repo_id, repo_type="dataset", filename=rel, - )) - - -def _pick_parquet_request(parquet_path: Path) -> str | None: - """fzf picker over the rows of a captures parquet. Same shape as - ``_pick_workspace_request`` but the preview pipeline shells out to - ``_preview_parquet`` (which reads from the parquet) instead of - ``_preview`` (which scans the workspace). Returns the picked - FULL rid or ``None`` if cancelled. Unlike the workspace flow - (which accepts an 8-char prefix because ``resolve_workspace_rid`` - expands it), the parquet-source path through ``_resolve_request_id`` - does an exact-match lookup, so we must return the full rid.""" - import shlex - import sys - - rows = _enumerate_parquet_requests(parquet_path) - if not rows: - raise click.UsageError(f"no rows in {parquet_path}") - header, fzf_lines = _format_inspect_rows(rows) - pq_quoted = shlex.quote(str(parquet_path)) - preview = ( - f"{sys.executable} -m agentcap _preview_parquet {pq_quoted}" - f" {{2}} {{3}} 2>/dev/null" - f" | head -400" - f" | {sys.executable} -m agentcap _highlight {{q}}" - ) - extra = [ - "--delimiter", "\t", "--with-nth", "1", - "--no-hscroll", - "--bind", "change:refresh-preview", - ] - picked = _fzf_pick(header, fzf_lines, preview, extra_args=extra) - if picked is None: - return None - # Hidden tab-delim column 2 carries the full 32-char rid - # (set by ``_format_inspect_rows``). Avoid the visible 8-char - # prefix — the parquet's request_id column stores full rids. - fields = picked.split("\t") - import re - full_rid = fields[1] if len(fields) >= 2 else "" - if not re.fullmatch(r"[0-9a-f]{32}", full_rid): - return None - return full_rid - - -@cli.command("inspect") -@click.argument("target", required=False, shell_complete=_complete_request_ids) -@click.option( - "--rid", - "print_rid_only", - is_flag=True, - help="When picking interactively, print only the selected request-id " - "(so it can be captured or piped into another command).", -) -def inspect_cmd(target: str | None, print_rid_only: bool) -> None: - """Inspect captured requests. - - \b - - ``agentcap inspect`` pick from cwd workspace - - ``agentcap inspect `` pick from one run in cwd workspace - - ``agentcap inspect `` print the captured body - - ``agentcap inspect `` pick from another local workspace - - ``agentcap inspect .parquet`` pick from a captures parquet - - ``agentcap inspect hf://datasets//`` pick from an HF dataset - - ``agentcap inspect /`` same as above (shorthand) - - TARGET is classified by content (does the path exist? does it - look like an hf URI? all hex?). ``inspect`` does NOT consult - ``$AGENTCAP_WORKSPACE`` — what you point it at is what you get. - - The interactive pickers require fzf on PATH. - """ - import json as _json - - kind, payload = _classify_target(target) - - if kind == "rid": - # Body dump from cwd workspace. - cwd_ws = Path.cwd() / _WORKSPACE_DIR - full_rid, body, resp_rec, _, _ = _resolve_request_id( - payload, None, workspace=cwd_ws, # type: ignore[arg-type] - ) - if resp_rec is not None: - click.echo( - f" request_id={full_rid} " - f"captured_at={resp_rec.get('captured_at_resp', '?')} " - f"status={resp_rec.get('status_code', '?')}", - err=True, - ) - click.echo(_json.dumps(body, indent=2, ensure_ascii=False)) - return - - if kind == "workspace": - ws: Path = payload # type: ignore[assignment] - # run picker → request picker → message picker. Esc walks - # back one level at a time. - while True: - scope = _pick_workspace_run(workspace=ws) - if scope is None: - return - last_pick: str | None = None - while True: - pick = _pick_workspace_request( - scope, initial_short_rid=last_pick, workspace=ws, - ) - if pick is None: - break - last_pick = pick - if print_rid_only: - full_rid, _, _, _, _ = _resolve_request_id( - pick, None, workspace=ws, - ) - click.echo(full_rid) - return - _pick_request_message(pick, workspace=ws) - - if kind == "workspace-run": - ws = Path.cwd() / _WORKSPACE_DIR - scope = payload # type: ignore[assignment] - last_pick = None - while True: - pick = _pick_workspace_request( - scope, initial_short_rid=last_pick, workspace=ws, - ) - if pick is None: - return # explicit run-id on CLI; Esc → exit - last_pick = pick - if print_rid_only: - full_rid, _, _, _, _ = _resolve_request_id( - pick, None, workspace=ws, - ) - click.echo(full_rid) - return - _pick_request_message(pick, workspace=ws) - - if kind in ("parquet", "hf"): - # ``hf`` holds the picker's tempdir + row list at this scope - # so Esc-back re-entries are instant. - import contextlib as _contextlib - import logging as _logging - import tempfile as _tempfile - if kind == "hf": - # Mute huggingface_hub's retry warnings so transient hub - # slowness doesn't leak to the user's terminal. - _logging.getLogger("huggingface_hub").setLevel(_logging.ERROR) - from huggingface_hub import HfApi - hf_rows = _hf_list_parquets(payload) # type: ignore[arg-type] - if not hf_rows: - raise click.UsageError(f"no .parquet files in {payload}") - try: - hf_revision = HfApi().repo_info( - payload, repo_type="dataset", # type: ignore[arg-type] - ).sha - except Exception: # noqa: BLE001 - hf_revision = None - td_cm = _tempfile.TemporaryDirectory(prefix="agentcap-hf-meta-") - else: - td_cm = _contextlib.nullcontext(None) - hf_rows, hf_revision = [], None - with td_cm as td_str: - hf_tempdir = Path(td_str) if td_str else None - while True: - if kind == "hf": - pq_path = _pick_hf_dataset_parquet( - payload, hf_tempdir, hf_rows, hf_revision, # type: ignore[arg-type] - ) - if pq_path is None: - return # Esc on the parquet picker → exit - else: - pq_path = Path(payload) # type: ignore[arg-type] - pq_source = str(pq_path) - while True: - pick = _pick_parquet_request(pq_path) - if pick is None: - break # Esc on the request picker → back one level - if print_rid_only: - full_rid, _, _, _, _ = _resolve_request_id(pick, pq_source) - click.echo(full_rid) - return - _pick_request_message(pick, source=pq_source) - if kind == "parquet": - return # explicit parquet on CLI; Esc → exit - - -@cli.command("_run_preview", hidden=True) -@click.argument("run_id") -@click.option("--workspace", default=None, help="Workspace root (.agentcap dir).") -def _run_preview_cmd(run_id: str, workspace: str | None) -> None: - """Internal: preview a run's metadata for the run picker.""" - import json as _json - - root = Path(workspace) if workspace else _workspace_root() - run_dir = root / run_id - meta_path = run_dir / "run.json" - if not meta_path.is_file(): - click.echo(f"(no run.json at {meta_path})") - return - try: - meta = _json.loads(meta_path.read_text()) - except (OSError, _json.JSONDecodeError) as exc: - click.echo(f"(run.json unreadable: {exc})") - return - captures = run_dir / "captures" - n_caps = ( - len(list(captures.glob("*.request.json"))) if captures.is_dir() else 0 - ) - click.echo(f"run: {run_id}") - click.echo(f"agent: {meta.get('agent', '?')}") - click.echo(f"model: {meta.get('model', '?')}") - click.echo(f"upstream: {meta.get('upstream', '?')}") - click.echo(f"followup: {meta.get('followup', '?')}") - click.echo(f"turns/task: {meta.get('turns_per_task', '?')}") - click.echo(f"captures: {n_caps}") - click.echo() - click.echo("─── TASKS ───") - for t in meta.get("tasks") or []: - prompt = (t.get("prompt") or "").replace("\n", " ") - completed = t.get("completed_turns", "?") - click.echo(f" {t.get('task_id', '?')}: ({completed} turns) {prompt}") - - -def _message_key(m: dict) -> tuple: - """Canonical key for a ``messages[]`` entry. Compares only the - load-bearing fields (role/content/tool_call_id/tool_calls); ignores - optional metadata like the tool ``name`` field that some agents - include on one turn but not the next (notably hermes when it - re-serialises its session DB across turn boundaries).""" - import json as _json - c = m.get("content") - if isinstance(c, list): - c = _json.dumps(c, sort_keys=True) - tc = m.get("tool_calls") - tc_key = _json.dumps(tc, sort_keys=True) if tc else None - return (m.get("role"), c, m.get("tool_call_id"), tc_key) - - -def _diff_messages(prev: list, curr: list) -> tuple[list, list]: - """``(removed, added)`` — the suffixes of ``prev`` and ``curr`` that - diverge. Element-by-element so a length-equal turn boundary (where - an agent swaps a meta-prompt for the user's followup at the last - index) shows up as a real diff. Pure-append cases yield - ``removed=[]``; swaps yield non-empty removed AND added of equal - or unequal length depending on the truncation. - """ - prev_keys = [_message_key(m) for m in prev] - curr_keys = [_message_key(m) for m in curr] - n = min(len(prev_keys), len(curr_keys)) - i = n - for j in range(n): - if prev_keys[j] != curr_keys[j]: - i = j - break - return prev[i:], curr[i:] - - -def _delta_label(removed: int, added: int) -> str: - """Compact ``messages[]`` delta marker. Hides the removed count - when zero (the common pure-append case) so mid-loop rows stay - visually quiet; surfaces it for swaps (e.g. ``-1 +1``).""" - if removed: - return f"-{removed} +{added}" - return f"+{added}" - - -def _message_text(m: dict) -> str: - """Flatten ``message.content`` to a string. Tool / multimodal - messages carry list-typed content; join the text parts.""" - c = m.get("content") - if isinstance(c, list): - return " ".join( - p.get("text", "") for p in c if isinstance(p, dict) - ) - return c or "" - - -def _flatten(s: str, cap: int) -> str: - """Single-line, length-capped text. Without this, content with - embedded newlines (assistant prose, tool outputs) would blow up to - many visible lines and push later messages off fzf's preview - window.""" - s = " ".join(s.split()) - return s if len(s) <= cap else s[:cap] + "…" - - -_PICKER_SUMMARY_CAP = 160 -_PREVIEW_MSG_CAP = 400 - - -def _tag(label: str) -> str: - """Reverse-video the ``[label]`` marker that introduces each - preview line so the role boundaries are visually scannable across - many similar-looking rows.""" - return f"\033[7m[{label}]\033[0m" - - -def _message_summary(m: dict) -> str: - """One-line role-aware summary of one ``messages[]`` entry. Used - in the picker's MESSAGES column where we have ~one row to convey - 'what's new in this call'. Truncated so a large tool result can't - bloat the row.""" - role = (m or {}).get("role", "?") - if role == "assistant": - tcs = m.get("tool_calls") or [] - if tcs: - tc = tcs[0] - fn = (tc.get("function") or {}).get("name") or "?" - args = (tc.get("function") or {}).get("arguments") or "" - extra = f" +{len(tcs)-1}" if len(tcs) > 1 else "" - s = f"assistant→{fn}{extra} {args}" - else: - s = f"assistant: {_message_text(m)}" - elif role == "tool": - s = f"tool: {_message_text(m)}" - else: - s = f"{role}: {_message_text(m)}" - return _flatten(s, _PICKER_SUMMARY_CAP) - - -def _render_preview_message(m: dict) -> None: - """Render one ``messages[]`` entry into the inspect preview pane. - Each message stays on one line (newlines collapsed) so the diff - suffix remains visible inside fzf's 60% pane. ``color=True`` on - every echo: this command's stdout is captured by fzf's preview - subprocess (not a TTY), and click strips ANSI by default in that - case, which would silently swallow the reverse-video markers.""" - role = m.get("role", "?") - if role == "assistant": - for tc in m.get("tool_calls") or []: - fn = (tc.get("function") or {}).get("name") or "?" - args = (tc.get("function") or {}).get("arguments") or "" - click.echo( - f" {_tag(f'assistant tool_call → {fn}')} args={_flatten(args, 240)}", - color=True, - ) - content = _message_text(m) - if content: - click.echo( - f" {_tag('assistant content')} {_flatten(content, _PREVIEW_MSG_CAP)}", - color=True, - ) - return - if role == "tool": - tcid = (m.get("tool_call_id") or "?")[:8] - click.echo(f" {_tag(f'tool result, tool_call_id={tcid}')}", color=True) - click.echo(f" {_flatten(_message_text(m), _PREVIEW_MSG_CAP)}", color=True) - return - click.echo( - f" {_tag(role)} {_flatten(_message_text(m), _PREVIEW_MSG_CAP)}", - color=True, - ) - - -@cli.command("_preview", hidden=True) -@click.argument("request_id") -@click.argument("prev_request_id", required=False, default=None) -@click.option("--workspace", default=None, help="Workspace root (.agentcap dir).") -def _preview_cmd( - request_id: str, prev_request_id: str | None, workspace: str | None, -) -> None: - """Internal: header + initial PROMPT + MESSAGES diff for one - captured request — used by the fzf preview pane. - - Not part of the public CLI surface — hidden from ``--help``. The - user-facing inspector is ``agentcap inspect ``. - - ``prev_request_id`` is pushed in by the picker so the preview can - load the diff base directly instead of scanning the capture dir on - every fzf hover. Accepts ``"-"`` (or absent) for "no previous". - """ - import json as _json - import re - - # Hovered a section-header line in the picker — render nothing. - if not re.fullmatch(r"[0-9a-f]+", request_id): - click.echo("(section header — navigate to a request id)") - return - - ws = Path(workspace) if workspace else None - full_rid, body, resp_rec, req_rec, cap_dir = _resolve_request_id( - request_id, None, workspace=ws, - ) - messages = body.get("messages") or [] - initial_user = next( - (m for m in messages if m.get("role") == "user"), - None, - ) - initial_prompt = _message_text(initial_user or {}) - import time as _time - - status = ( - resp_rec.get("status_code") if resp_rec is not None else "?" - ) - serialized = _json.dumps(body, ensure_ascii=False) - size_b = len(serialized.encode("utf-8")) - task_id = (req_rec or {}).get("task_id") - turn = (req_rec or {}).get("turn") - captured_at = (req_rec or {}).get("captured_at") - ts = ( - _time.strftime("%H:%M:%S", _time.gmtime(int(captured_at))) - if captured_at else "?" - ) - # Load the diff base directly from the prev-rid file in the same - # capture dir (already known from ``_resolve_request_id`` above — - # no second workspace scan). The picker pushes the predecessor's - # rid in as ``prev_request_id``. Reject anything that isn't - # lowercase hex so a hand-crafted arg can't escape the capture - # dir via ``..`` or absolute paths. - prev_messages: list = [] - has_previous = False - if ( - cap_dir is not None - and prev_request_id - and prev_request_id != "-" - and re.fullmatch(r"[0-9a-f]+", prev_request_id) - ): - prev_path = cap_dir / f"{prev_request_id}.request.json" - if prev_path.is_file(): - try: - prev_rec = _json.loads(prev_path.read_text()) - prev_messages = (prev_rec.get("body") or {}).get("messages") or [] - has_previous = True - except (OSError, _json.JSONDecodeError): - pass - click.echo(f"rid: {full_rid}") - if task_id is not None or turn is not None: - click.echo(f"task: {task_id or '?'} turn={turn if turn is not None else '?'}") - click.echo(f"time: {ts}") - click.echo(f"status: {status}") - click.echo(f"model: {body.get('model', '?')}") - click.echo(f"size: {size_b:,} bytes (~{size_b // 4:,} tokens)") - click.echo() - click.echo("─── PROMPT ──────────────────────────────────────────────") - click.echo(initial_prompt or "(no user message)") - click.echo() - removed_messages, new_messages = _diff_messages(prev_messages, messages) - if has_previous: - header_suffix = ( - f"{_delta_label(len(removed_messages), len(new_messages))} " - f"since previous call" - ) - else: - n = len(new_messages) - header_suffix = f"initial: {n} msg{'' if n == 1 else 's'}" - click.echo(f"─── MESSAGES ({header_suffix}) ──────────") - if has_previous: - # Signals that the prior history (in prev_messages) was - # elided; what follows is the diff, not the whole conversation. - click.echo(" ...") - if not new_messages and not removed_messages: - click.echo("(no diff vs previous call)") - for m in new_messages: - _render_preview_message(m) - - -def _load_parquet_body(parquet_path: Path, rid: str) -> tuple[dict, dict, int | None, str | None]: - """Pull one request out of a captures parquet. Returns - ``(body, resp_rec, captured_at, run_id)``. - - The parquet's ``response`` column has two shapes depending on - whether the upstream streamed: - - - stream: ``{"stream": True, "raw": ""}`` - - non-stream: the bare OpenAI body dict (no wrapper) - - Workspace ``*.response.json`` records always have a ``body`` key, - and ``_decode_response`` follows that convention. Normalise the - non-stream parquet shape into ``{"stream": False, "body": ...}`` - here so callers (notably ``_decode_response`` / - ``_request_messages_for_view``) get the model reply rendered.""" - import json as _json - import pyarrow.parquet as pq - - t = pq.read_table( - str(parquet_path), - columns=["request_id", "captured_at", "request", "response", "run_id"], - filters=[("request_id", "=", rid)], - ) - if t.num_rows == 0: - return {}, {}, None, None - try: - body = _json.loads(t.column("request")[0].as_py() or "{}") - except _json.JSONDecodeError: - body = {} - try: - raw_resp = _json.loads(t.column("response")[0].as_py() or "{}") - except _json.JSONDecodeError: - raw_resp = {} - if raw_resp.get("stream"): - resp = raw_resp - else: - resp = {"stream": False, "body": raw_resp} - ts = t.column("captured_at")[0].as_py() - run_id = t.column("run_id")[0].as_py() - return body, resp, (int(ts) if ts is not None else None), run_id - - -@cli.command("_preview_parquet", hidden=True) -@click.argument("parquet_path") -@click.argument("request_id") -@click.argument("prev_request_id", required=False, default=None) -def _preview_parquet_cmd( - parquet_path: str, request_id: str, prev_request_id: str | None, -) -> None: - """Internal: same preview as ``_preview`` but sourced from a - parquet file. The picker passes the parquet path as a leading arg - so this hidden command stays stateless.""" - import json as _json - import re - import time as _time - - if not re.fullmatch(r"[0-9a-f]+", request_id): - click.echo("(section header — navigate to a request id)") - return - pq_path = Path(parquet_path) - body, resp, captured_at, run_id = _load_parquet_body(pq_path, request_id) - messages = body.get("messages") or [] - initial_user = next( - (m for m in messages if m.get("role") == "user"), - None, - ) - initial_prompt = _message_text(initial_user or {}) - status = resp.get("status_code", "?") if resp else "?" - serialized = _json.dumps(body, ensure_ascii=False) - size_b = len(serialized.encode("utf-8")) - ts = ( - _time.strftime("%H:%M:%S", _time.gmtime(captured_at)) - if captured_at else "?" - ) - - prev_messages: list = [] - has_previous = False - if ( - prev_request_id - and prev_request_id != "-" - and re.fullmatch(r"[0-9a-f]+", prev_request_id) - ): - prev_body, _, _, _ = _load_parquet_body(pq_path, prev_request_id) - prev_messages = prev_body.get("messages") or [] - has_previous = bool(prev_messages) - - click.echo(f"rid: {request_id}") - if run_id is not None: - click.echo(f"run: {run_id}") - click.echo(f"time: {ts}") - click.echo(f"status: {status}") - click.echo(f"model: {body.get('model', '?')}") - click.echo(f"size: {size_b:,} bytes (~{size_b // 4:,} tokens)") - click.echo() - click.echo("─── PROMPT ──────────────────────────────────────────────") - click.echo(initial_prompt or "(no user message)") - click.echo() - removed_messages, new_messages = _diff_messages(prev_messages, messages) - if has_previous: - header_suffix = ( - f"{_delta_label(len(removed_messages), len(new_messages))} " - f"since previous call" - ) - else: - n = len(new_messages) - header_suffix = f"initial: {n} msg{'' if n == 1 else 's'}" - click.echo(f"─── MESSAGES ({header_suffix}) ──────────") - if has_previous: - click.echo(" ...") - if not new_messages and not removed_messages: - click.echo("(no diff vs previous call)") - for m in new_messages: - _render_preview_message(m) - - -def _decode_sse_response(raw: str) -> dict: - """Decode an OpenAI-compatible SSE response stream into a single - synthesized assistant message: ``{content, tool_calls, - finish_reason}``. Concatenates ``delta.content`` chunks; merges - ``delta.tool_calls`` chunks by their ``index`` field (the first - chunk for an index carries id + function.name; later chunks - accumulate ``function.arguments`` string fragments).""" - import json as _json - content_parts: list[str] = [] - tool_calls_by_idx: dict[int, dict] = {} - finish_reason: str | None = None - for line in raw.splitlines(): - if not line.startswith("data:"): - continue - payload = line[len("data:"):].strip() - if not payload or payload == "[DONE]": - continue - try: - obj = _json.loads(payload) - except (_json.JSONDecodeError, ValueError): - continue - for ch in obj.get("choices") or []: - delta = ch.get("delta") or {} - if delta.get("content"): - content_parts.append(delta["content"]) - for tc_delta in delta.get("tool_calls") or []: - idx = tc_delta.get("index", 0) - slot = tool_calls_by_idx.setdefault(idx, { - "id": "", "type": "function", - "function": {"name": "", "arguments": ""}, - }) - if tc_delta.get("id"): - slot["id"] = tc_delta["id"] - if tc_delta.get("type"): - slot["type"] = tc_delta["type"] - fn = tc_delta.get("function") or {} - if fn.get("name"): - slot["function"]["name"] = fn["name"] - if fn.get("arguments"): - slot["function"]["arguments"] += fn["arguments"] - if ch.get("finish_reason"): - finish_reason = ch["finish_reason"] - return { - "content": "".join(content_parts), - "tool_calls": [tool_calls_by_idx[k] for k in sorted(tool_calls_by_idx)], - "finish_reason": finish_reason, - } - - -def _decode_response(resp_rec: dict) -> dict: - """Synthesize an assistant message from a response record. Handles - both non-stream (``body.choices[0].message``) and stream (raw SSE - bytes in ``raw``).""" - if resp_rec.get("stream"): - return _decode_sse_response(resp_rec.get("raw") or "") - body = resp_rec.get("body") or {} - ch = (body.get("choices") or [{}])[0] - msg = ch.get("message") or {} - return { - "content": msg.get("content") or "", - "tool_calls": msg.get("tool_calls") or [], - "finish_reason": ch.get("finish_reason"), - } - - -def _request_messages_for_view( - body: dict, resp_rec: dict | None -) -> list[dict]: - """Flatten ``messages[]`` + decoded response into one record per - picker row. Each assistant ``tool_calls`` produces its own row - followed (if present) by a row for the assistant's content; the - decoded model response is appended at the end as the final - assistant turn so the viewer shows the model's reply inline. - - Each record: ``{msg_idx, role, summary, content, ...}``. ``msg_idx`` - is the index into the original ``messages[]`` (or ``None`` for the - synthesized response rows).""" - records: list[dict] = [] - msgs = body.get("messages") or [] - for i, m in enumerate(msgs): - role = m.get("role", "?") - if role == "assistant": - for tc in m.get("tool_calls") or []: - fn = (tc.get("function") or {}).get("name") or "?" - args = (tc.get("function") or {}).get("arguments") or "" - records.append({ - "msg_idx": i, - "role": f"assistant→{fn}", - "summary": args, - "content": args, - "tool_call_id": tc.get("id"), - }) - content = _message_text(m) - if content: - records.append({ - "msg_idx": i, - "role": "assistant", - "summary": content, - "content": content, - }) - continue - if role == "tool": - content = _message_text(m) - records.append({ - "msg_idx": i, - "role": "tool", - "summary": content, - "content": content, - "tool_call_id": m.get("tool_call_id"), - }) - continue - content = _message_text(m) - records.append({ - "msg_idx": i, - "role": role, - "summary": content, - "content": content, - }) - if resp_rec is not None: - decoded = _decode_response(resp_rec) - for tc in decoded.get("tool_calls") or []: - fn = (tc.get("function") or {}).get("name") or "?" - args = (tc.get("function") or {}).get("arguments") or "" - records.append({ - "msg_idx": None, - "role": f"response→{fn}", - "summary": args, - "content": args, - "tool_call_id": tc.get("id"), - }) - content = decoded.get("content") or "" - if content: - records.append({ - "msg_idx": None, - "role": "response", - "summary": content, - "content": content, - "finish_reason": decoded.get("finish_reason"), - }) - return records - - -def _render_msg_preview(records: list[dict], row: int) -> None: - """Echo one entry from a message list — shared between the - workspace- and parquet-sourced ``_msg_preview*`` commands.""" - if row < 1 or row > len(records): - click.echo(f"(row {row} out of range; have {len(records)})") - return - rec = records[row - 1] - click.echo(f"role: {rec['role']}") - if rec.get("msg_idx") is not None: - click.echo(f"msg_idx: {rec['msg_idx']}") - else: - click.echo("msg_idx: (response)") - if rec.get("tool_call_id"): - click.echo(f"tool_call_id: {rec['tool_call_id']}") - if rec.get("finish_reason"): - click.echo(f"finish_reason: {rec['finish_reason']}") - click.echo() - click.echo(rec.get("content") or "(no content)") - - -@cli.command("_msg_preview", hidden=True) -@click.argument("request_id") -@click.argument("row", type=int) -@click.option("--workspace", default=None, help="Workspace root (.agentcap dir).") -def _msg_preview_cmd( - request_id: str, row: int, workspace: str | None, -) -> None: - """Internal: render one message (1-indexed ``row``) from the - request's flattened message list. Used by the workspace-sourced - message sub-picker.""" - import re - if not re.fullmatch(r"[0-9a-f]+", request_id): - click.echo("(invalid request id)") - return - ws = Path(workspace) if workspace else None - _, body, resp_rec, _, _ = _resolve_request_id( - request_id, None, workspace=ws, - ) - _render_msg_preview(_request_messages_for_view(body, resp_rec), row) - - -@cli.command("_msg_preview_parquet", hidden=True) -@click.argument("parquet_path") -@click.argument("request_id") -@click.argument("row", type=int) -def _msg_preview_parquet_cmd( - parquet_path: str, request_id: str, row: int, -) -> None: - """Internal: same as ``_msg_preview`` but sourced from a parquet. - Parquet ``response`` column is a JSON blob (no streaming wrapper), - so we pass it through unchanged — ``_decode_response`` handles - both the non-stream and the SSE-wrapped shapes.""" - import re - if not re.fullmatch(r"[0-9a-f]+", request_id): - click.echo("(invalid request id)") - return - body, resp, _, _ = _load_parquet_body(Path(parquet_path), request_id) - _render_msg_preview(_request_messages_for_view(body, resp or None), row) - - -def _pick_request_message( - rid: str, *, source: str | None = None, workspace: Path | None = None, -) -> None: - """Second-level fzf picker over the messages of the request the - user selected in the request picker. Read-only browse: Esc / Enter - both return to the caller without side effects. - - ``source`` is ``None`` for workspace-sourced rids (current - behaviour) and a local parquet path for parquet-sourced rids — the - preview pipeline shells out to a different hidden command in each - case so the picker doesn't need to know how the body was loaded. - ``workspace`` overrides the default workspace lookup for - workspace-sourced rids (inspect passes cwd or the resolved - dir explicitly).""" - import shlex - import sys - # ``_resolve_request_id`` returns ``resp_rec=None`` for any - # ``source`` (it calls ``captures.load_request`` which only loads the - # request body). For parquet sources, read the response back via - # ``_load_parquet_body`` so the message picker can show the model - # reply rows synthesised by ``_request_messages_for_view``. - if source and source.endswith(".parquet"): - body, resp_rec, _, _ = _load_parquet_body(Path(source), rid) - full_rid = rid - else: - full_rid, body, resp_rec, _, _ = _resolve_request_id( - rid, source, workspace=workspace, - ) - records = _request_messages_for_view(body, resp_rec) - if not records: - click.echo("(no messages in this request)") - return - role_w = max(len(r["role"]) for r in records) - lines: list[str] = [] - for i, rec in enumerate(records, start=1): - summary = _flatten(rec.get("summary") or "", 200) - display = f"[{i:>3}] {rec['role']:<{role_w}s} {summary}" - # Hidden tab-delimited column 2 carries the 1-indexed row. - # The preview reads it as ``{2}`` instead of computing - # ``$(({n} + 1))`` — the latter is POSIX-arithmetic and fzf - # runs previews via ``$SHELL -c``, so fish would break. - lines.append(f"{display}\t{i}") - header = f"messages for {full_rid[:8]} ({len(records)} entries)" - if source and source.endswith(".parquet"): - preview = ( - f"{sys.executable} -m agentcap _msg_preview_parquet" - f" {shlex.quote(source)} {full_rid} {{2}} 2>/dev/null" - f" | {sys.executable} -m agentcap _highlight {{q}}" - ) - else: - ws_arg = ( - f"--workspace {shlex.quote(str(workspace))} " - if workspace is not None else "" - ) - preview = ( - f"{sys.executable} -m agentcap _msg_preview {ws_arg}" - f"{full_rid} {{2}} 2>/dev/null" - f" | {sys.executable} -m agentcap _highlight {{q}}" - ) - _fzf_pick( - header, lines, preview, - extra_args=[ - "--delimiter", "\t", "--with-nth", "1", - "--no-hscroll", - "--bind", "change:refresh-preview", - ], - ) - - -def _parse_fzf_terms(query: str) -> list[str]: - """Split fzf's query into the literal text of each non-negated - term. Each term has its operator prefix (``'``, ``^``) and - trailing anchor (``$``) stripped so the remainder is the substring - to highlight. Negated terms (``!word``) and bare ``|`` OR - separators are skipped — they aren't substrings to colour.""" - terms: list[str] = [] - for raw in query.split(): - if raw in ("|", ""): - continue - if raw.startswith("!"): - continue - t = raw - if t and t[0] in ("'", "^"): - t = t[1:] - if t.endswith("$"): - t = t[:-1] - if t: - terms.append(t) - return terms - - -@cli.command("_highlight", hidden=True) -@click.argument("query") -def _highlight_cmd(query: str) -> None: - """Read stdin, write stdout with each (case-insensitive) literal - occurrence of every fzf search term in ``query`` wrapped in bold - red. Used by the inspect picker's preview pipeline so the user's - typed query is visible in the preview pane. - - Substring match per term — agrees with fzf's exact-match operator - (``'word``) and the default fuzzy mode when the fuzzy chars happen - to be contiguous. Operators ``'``, ``^``, ``$`` are stripped from - each term before matching; negated terms (``!word``) and ``|`` OR - separators are skipped (nothing to highlight). Special characters - in each term are escaped, so typing ``.``, ``[``, etc. is safe. - """ - import re - import sys - terms = _parse_fzf_terms(query) - if not terms: - sys.stdout.write(sys.stdin.read()) - return - # Longest terms first so a longer substring isn't shadowed by a - # shorter one that's a prefix of it. - terms.sort(key=len, reverse=True) - pat = re.compile( - "|".join(re.escape(t) for t in terms), re.IGNORECASE - ) - for line in sys.stdin: - sys.stdout.write( - pat.sub(lambda m: f"\033[1;31m{m.group(0)}\033[0m", line) - ) - - -def main() -> int: - cli.main(standalone_mode=True) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/agentcap/captures.py b/src/agentcap/captures.py deleted file mode 100644 index 4925b3c..0000000 --- a/src/agentcap/captures.py +++ /dev/null @@ -1,190 +0,0 @@ -"""Resolve a captured request by id and hand back the body. - -No agentcap-side normalisation or mutation of the JSON object — captures -persist the request as parsed JSON, so the original byte sequence -(whitespace, key ordering) isn't recoverable, but the JSON object is. -Consumers that hit cross-server strictness do their own normalisation -(see AGENTS.md #3). -""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Iterable - - -def load_request(source: str, request_id: str) -> dict: - """Return the raw captured request body for ``request_id``. - - ``source`` resolves any of: - - a local capture dir (``.request.json`` files), - - a local ``.parquet`` produced by ``agentcap export``, - - ``hf://datasets//`` or the bare ``/`` - form. - - Raises ``KeyError`` if the id is not found. - """ - return load_requests(source, [request_id])[request_id] - - -def load_requests( - source: str, request_ids: Iterable[str] -) -> dict[str, dict]: - """Batch form: one pass over the source per file, returns ``{id: body}``. - - Raises ``KeyError`` listing any ids that weren't found. - """ - wanted = set(request_ids) - if not wanted: - return {} - - # Resolve local paths first — an existing dir/file wins over the HF - # heuristic, so ``runs/abc/captures`` isn't misclassified as a repo. - p = Path(source).expanduser() - if p.is_dir(): - bodies = _load_from_capture_dir(p, wanted) - elif p.is_file() and p.suffix == ".parquet": - bodies = _load_from_parquet(p, wanted) - elif _looks_like_hf_source(source): - bodies = _load_from_hf_dataset(source, wanted) - else: - raise ValueError( - f"source must be a capture dir, a .parquet file, or an " - f"hf://datasets/... URI — got {source!r}" - ) - - missing = wanted - set(bodies) - if missing: - raise KeyError( - f"request_id(s) not found in {source!r}: {sorted(missing)}" - ) - return bodies - - -def _looks_like_hf_source(source: str) -> bool: - if source.startswith("hf://"): - return True - # Bare ``/`` — exactly one ``/`` and no path-separator - # prefix. Heuristic for distinguishing an HF repo from a local path. - if source.startswith((".", "/", "~")): - return False - return source.count("/") == 1 - - -def _load_from_capture_dir( - capture_dir: Path, wanted: set[str] -) -> dict[str, dict]: - out: dict[str, dict] = {} - for rid in wanted: - path = capture_dir / f"{rid}.request.json" - if not path.is_file(): - continue - rec = json.loads(path.read_text()) - body = rec.get("body") - if isinstance(body, dict): - out[rid] = body - return out - - -def _load_from_parquet( - parquet_path: Path, wanted: set[str] -) -> dict[str, dict]: - import pyarrow.parquet as pq - - table = pq.read_table( - str(parquet_path), - columns=["request_id", "request"], - filters=[("request_id", "in", list(wanted))], - ) - return _scan_arrow_table(table, wanted) - - -def _load_from_hf_dataset( - source: str, wanted: set[str] -) -> dict[str, dict]: - """Scan every parquet under ``data/`` in the dataset until all - wanted ids are found (or files exhausted).""" - import pyarrow.parquet as pq - from huggingface_hub import HfFileSystem - - s = source.removeprefix("hf://datasets/").strip("/") - parts = s.split("/") - if len(parts) != 2 or not parts[0] or not parts[1]: - raise ValueError( - f"hf source must be /, got {source!r}" - ) - repo_id = f"{parts[0]}/{parts[1]}" - fs = HfFileSystem() - prefix = f"datasets/{repo_id}/data" - - out: dict[str, dict] = {} - remaining = set(wanted) - for entry in fs.ls(prefix, detail=True): - if entry.get("type") != "file" or not entry["name"].endswith(".parquet"): - continue - with fs.open(entry["name"], "rb") as fh: - table = pq.read_table( - fh, - columns=["request_id", "request"], - filters=[("request_id", "in", list(remaining))], - ) - found = _scan_arrow_table(table, remaining) - out.update(found) - remaining -= set(found) - if not remaining: - break - return out - - -def _scan_arrow_table(table, wanted: set[str]) -> dict[str, dict]: - out: dict[str, dict] = {} - rid_col = table.column("request_id").to_pylist() - req_col = table.column("request").to_pylist() - for rid, req_str in zip(rid_col, req_col): - if rid in wanted and isinstance(req_str, str): - out[rid] = json.loads(req_str) - return out - - -class AmbiguousRequestId(Exception): - """Raised when a short rid prefix matches more than one captured - request — caller should ask the user to disambiguate (like - ``git`` does).""" - - def __init__(self, prefix: str, matches: list[str]): - self.prefix = prefix - self.matches = matches - super().__init__( - f"rid prefix {prefix!r} is ambiguous ({len(matches)} matches): " - f"{', '.join(sorted(matches)[:5])}{'…' if len(matches) > 5 else ''}" - ) - - -def resolve_workspace_rid( - workspace_root: Path, request_id: str -) -> tuple[Path, str] | None: - """Find the capture dir + full rid for a (possibly truncated) request id. - - Returns ``(capture_dir, full_rid)`` for the unique match, ``None`` if - no match. Raises ``AmbiguousRequestId`` when multiple rids share the - prefix. - """ - if not workspace_root.is_dir(): - return None - matches: list[tuple[Path, str]] = [] - for run_dir in workspace_root.iterdir(): - captures = run_dir / "captures" - if not captures.is_dir(): - continue - # Exact match shortcut — also makes full-length rids O(1). - exact = captures / f"{request_id}.request.json" - if exact.is_file(): - return captures, request_id - for hit in captures.glob(f"{request_id}*.request.json"): - matches.append((captures, hit.name.removesuffix(".request.json"))) - if not matches: - return None - if len(matches) > 1: - raise AmbiguousRequestId(request_id, [m[1] for m in matches]) - return matches[0] diff --git a/src/agentcap/drivers/__init__.py b/src/agentcap/drivers/__init__.py deleted file mode 100644 index ea15b46..0000000 --- a/src/agentcap/drivers/__init__.py +++ /dev/null @@ -1,174 +0,0 @@ -"""Agent driver adapters. - -A driver wraps an agent CLI (Hermes, OpenCode, …) so the orchestrator -can: - - - start a new session with an initial prompt, - - resume an existing session for a follow-up prompt, - - extract the final response text from each turn (for the - follow-up synthesizer). - -Drivers shell out to the agent's binary; they do not implement the -agent's semantics. Configuring the agent to point at the capture proxy -(via config file or env) is the orchestrator's responsibility. -""" - -from __future__ import annotations - -import abc -from dataclasses import dataclass, field -from typing import Callable - - -@dataclass -class AgentTurn: - """One turn of agent execution.""" - - session_id: str | None - response_text: str - returncode: int - stdout: str - stderr: str - #: ``": "`` for each errored tool call in stdout - #: (driver-specific parser). Empty if the driver has no parser yet. - tool_errors: list[str] = field(default_factory=list) - - -class AgentDriver(abc.ABC): - """Abstract adapter wrapping an agent CLI.""" - - name: str - - @abc.abstractmethod - def start( - self, - prompt: str, - *, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - """Start a new session with ``prompt``. Must populate - ``session_id`` if the agent supports resume.""" - - @abc.abstractmethod - def resume( - self, - prompt: str, - *, - session_id: str, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - """Continue session ``session_id`` with ``prompt``. Drivers - whose agent doesn't natively support resume must emulate it - (e.g. by replaying prior messages).""" - - -def _hermes_factory(**kwargs) -> AgentDriver: - from .hermes import HermesDriver - - return HermesDriver(**kwargs) - - -def _opencode_factory(**kwargs) -> AgentDriver: - from .opencode import OpenCodeDriver - - return OpenCodeDriver(**kwargs) - - -def _goose_factory(**kwargs) -> AgentDriver: - from .goose import GooseDriver - - return GooseDriver(**kwargs) - - -def _pi_factory(**kwargs) -> AgentDriver: - from .pi import PiDriver - - return PiDriver(**kwargs) - - -# Single source of truth for which agents the orchestrator supports. -# Adding a new driver: write the module + factory, append one entry -# here. Both ``get_driver`` and the ``--agent`` Click choice in -# ``__main__`` consume this — they cannot drift apart. -DRIVER_REGISTRY: dict[str, Callable[..., AgentDriver]] = { - "hermes": _hermes_factory, - "opencode": _opencode_factory, - "goose": _goose_factory, - "pi": _pi_factory, -} - - -def known_drivers() -> tuple[str, ...]: - """Names of registered driver adapters, in registration order. - - Used to populate ``agentcap run --agent`` choices and to enumerate - what's available without importing each driver module eagerly. - """ - return tuple(DRIVER_REGISTRY) - - -# Native session-trace surfacing. Two patterns: -# -# * **Symlink-the-dir** (pi, hermes): the agent writes one file per -# session into its native sessions dir. ``agentcap run`` bind-mounts -# ``/traces/`` and the image entrypoint symlinks the -# native dir at it, so transcripts land on the host as they're -# written. -# -# * **Post-run dump** (goose, opencode): the agent writes to a SQLite -# store; no per-session files exist on disk. The image ships a -# ``dump-traces`` script that lists sessions and exports each one -# via the agent's own CLI. The orchestrator calls it once after -# the corpus completes. -# -# The first column drives the in-container symlink at start-up -# (set in the per-agent ``agentcap-init.sh``); the second drives the -# post-corpus dump via :func:`traces_dump_argv_for`. -SESSIONS_PATH_IN_CONTAINER: dict[str, str] = { - "pi": "/opt/pi-config/sessions", -} - -# Agents whose images ship a ``dump-traces`` executable (on PATH). -# Called as ``sandbox.run(["dump-traces"])`` after the corpus to -# render SQLite-stored sessions into JSON/JSONL files under -# ``AGENTCAP_TRACES_DIR``. Symlink-style agents (pi) don't need it. -# -# hermes/goose/opencode all use SQLite session stores — there's no -# per-session file on disk to symlink, so we dump via the agent's -# own export CLI post-corpus. -_TRACES_DUMP_AGENTS: frozenset[str] = frozenset({"hermes", "goose", "opencode"}) - - -def sessions_path_for(agent: str) -> str | None: - return SESSIONS_PATH_IN_CONTAINER.get(agent) - - -def traces_dump_argv_for(agent: str) -> list[str] | None: - """In-container argv for the post-corpus trace dump, or None if - the agent surfaces traces through the symlink mechanism (no dump - step needed).""" - if agent in _TRACES_DUMP_AGENTS: - return ["dump-traces"] - return None - - -def get_driver(name: str, **kwargs) -> AgentDriver: - """Lookup a driver by short name.""" - try: - factory = DRIVER_REGISTRY[name] - except KeyError: - raise ValueError( - f"unknown driver: {name!r}; known: {', '.join(known_drivers())}" - ) from None - return factory(**kwargs) - - -__all__ = [ - "AgentDriver", - "AgentTurn", - "DRIVER_REGISTRY", - "get_driver", - "known_drivers", -] diff --git a/src/agentcap/drivers/goose.py b/src/agentcap/drivers/goose.py deleted file mode 100644 index 6da7c65..0000000 --- a/src/agentcap/drivers/goose.py +++ /dev/null @@ -1,125 +0,0 @@ -"""Goose driver. - -Drives ``goose run -t ""`` non-interactively. The proxy URL + -provider + ``OPENAI_API_KEY`` are baked into the per-agent image's -ENV (see [containers/agentcap-goose.Containerfile]( -../../../containers/agentcap-goose.Containerfile)); the driver only -sets ``GOOSE_MODEL`` per run. - -Goose's own session state lives at ``~/.config/goose/sessions/`` -inside the sandbox, redirected to the bind-mounted ``state/`` dir -so it survives ``podman run --rm`` boundaries between turns. -""" - -from __future__ import annotations - -import subprocess -import uuid -from pathlib import Path -from typing import Sequence - -from . import AgentDriver, AgentTurn -from ..sandbox import Sandbox - - -def parse_tool_errors(stdout: str) -> list[str]: - # TODO: goose's tool-error format is not yet characterised. - return [] - - -class GooseDriver(AgentDriver): - name = "goose" - - def __init__( - self, - *, - sandbox: Sandbox, - binary: str = "goose", - model: str | None = None, - cwd: Path | str | None = None, - extra_args: Sequence[str] = (), - ) -> None: - self.sandbox = sandbox - self.binary = binary - self.model = model - # ``cwd`` is sandbox-side: a host path bind-mounted into the - # container at the same path. - self.cwd = str(cwd) if cwd is not None else None - self.extra_args = list(extra_args) - - def close(self) -> None: - """No-op.""" - - def _build_argv( - self, prompt: str, *, session_name: str | None, resume: bool - ) -> list[str]: - argv = [self.binary, "run", "-t", prompt, *self.extra_args] - if session_name is None: - argv.append("--no-session") - else: - argv.extend(["--name", session_name]) - if resume: - argv.append("--resume") - return argv - - def _run( - self, - argv: list[str], - env: dict | None, - timeout: float | None, - ) -> subprocess.CompletedProcess: - full_env: dict[str, str] = {} - if self.model: - full_env["GOOSE_MODEL"] = self.model - if env: - full_env.update(env) - return self.sandbox.run( - argv, - env=full_env, - cwd=self.cwd, - timeout=timeout, - ) - - def start( - self, - prompt: str, - *, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - session_name = f"agentcap-{uuid.uuid4().hex[:8]}" - proc = self._run( - self._build_argv(prompt, session_name=session_name, resume=False), - env, - timeout, - ) - return AgentTurn( - session_id=session_name, - response_text=proc.stdout.strip(), - returncode=proc.returncode, - stdout=proc.stdout, - stderr=proc.stderr, - tool_errors=parse_tool_errors(proc.stdout), - ) - - def resume( - self, - prompt: str, - *, - session_id: str, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - proc = self._run( - self._build_argv(prompt, session_name=session_id, resume=True), - env, - timeout, - ) - return AgentTurn( - session_id=session_id, - response_text=proc.stdout.strip(), - returncode=proc.returncode, - stdout=proc.stdout, - stderr=proc.stderr, - tool_errors=parse_tool_errors(proc.stdout), - ) diff --git a/src/agentcap/drivers/hermes.py b/src/agentcap/drivers/hermes.py deleted file mode 100644 index 2ea34b1..0000000 --- a/src/agentcap/drivers/hermes.py +++ /dev/null @@ -1,204 +0,0 @@ -"""Hermes driver. - -Drives ``hermes chat -q ""`` non-interactively. ``~/.hermes/`` -is baked into the per-agent image with the proxy URL and context -length pointing at the in-process proxy — see -[containers/agentcap-hermes.Containerfile]( -../../../containers/agentcap-hermes.Containerfile). The driver does -no per-run config rewriting. - -Identity content (``SOUL.md``, etc.) and per-run state (``memories/``, -``sessions/``, ``logs/``) live under the image's ``/root/.hermes/``; -state-db symlinks redirect SQLite writes to the bind-mounted -``state/`` dir so session continuity survives ``podman run --rm`` -boundaries between turns. -""" - -from __future__ import annotations - -import re -import subprocess -from pathlib import Path -from typing import Sequence - -import yaml - -from . import AgentDriver, AgentTurn -from ..sandbox import Sandbox - - -_SESSION_ID_RE = re.compile(r"session_id:\s*([a-zA-Z0-9_\-]+)") -_RESUMED_MARKER = "Resumed" - - -def parse_session_id(output: str) -> str | None: - m = _SESSION_ID_RE.search(output) - return m.group(1) if m else None - - -def parse_tool_errors(stdout: str) -> list[str]: - # TODO: hermes' tool-error format is not yet characterised. - return [] - - -def parse_response_text(stdout: str) -> str: - """Extract the assistant body from a hermes run. - - For a resumed session, hermes prints a ``↻ Resumed `` marker - before the new turn — we slice everything after the last such - marker. For an initial run we use the whole stdout. Then strip - bare ``session_id:`` lines and surrounding whitespace. - """ - lines = stdout.splitlines() - last = -1 - for i, line in enumerate(lines): - if _RESUMED_MARKER in line and "↻" in line: - last = i - body_lines = lines[last + 1 :] if last >= 0 else lines - cleaned = [ - ln for ln in body_lines if not _SESSION_ID_RE.match(ln.strip()) - ] - return "\n".join(cleaned).strip() - - -def _rewrite_config( - config_text: str, - *, - base_url: str, - context_length_override: int | None = None, -) -> str: - """Round-trip a hermes ``config.yaml`` through PyYAML, overriding - ``model.base_url`` and (optionally) ``context_length``. Kept for - unit tests; the production path bakes the equivalent into the - image, so the driver never calls this at runtime.""" - cfg = yaml.safe_load(config_text) or {} - if not isinstance(cfg, dict): - raise ValueError("hermes config.yaml is not a YAML mapping") - - model = cfg.setdefault("model", {}) - if not isinstance(model, dict): - raise ValueError("hermes config.yaml: 'model' must be a mapping") - model["base_url"] = base_url - - if context_length_override is not None: - model["context_length"] = context_length_override - aux = cfg.setdefault("auxiliary", {}) - if not isinstance(aux, dict): - raise ValueError( - "hermes config.yaml: 'auxiliary' must be a mapping" - ) - comp = aux.setdefault("compression", {}) - if not isinstance(comp, dict): - raise ValueError( - "hermes config.yaml: 'auxiliary.compression' must be a mapping" - ) - comp["context_length"] = context_length_override - - return yaml.safe_dump(cfg, sort_keys=False) - - -class HermesDriver(AgentDriver): - name = "hermes" - - def __init__( - self, - *, - sandbox: Sandbox, - binary: str = "hermes", - model: str | None = None, - extra_args: Sequence[str] = ("-Q", "--yolo", "--accept-hooks"), - cwd: Path | str | None = None, - ignore_rules: bool = False, - toolsets: str | None = None, - ) -> None: - # cwd: sandbox-side working directory. Hermes auto-injects - # AGENTS.md / CLAUDE.md / .cursorrules from its cwd into every - # system prompt; the orchestrator typically passes the result - # of ``sandbox.mkdtemp`` so per-run cwd state doesn't leak. - # - # ignore_rules / toolsets shrink the default Hermes system - # prompt for CPU + small-model runs. - # - # model: passed via ``hermes chat -m ``. The CLI flag is - # the only path that reliably populates the ``model`` field - # in the outbound OAI request body; ``model.name`` in - # ``config.yaml`` doesn't propagate for every provider profile. - self.sandbox = sandbox - self.binary = binary - self.model = model - self.extra_args = list(extra_args) - self.cwd = str(cwd) if cwd is not None else None - self.ignore_rules = ignore_rules - self.toolsets = toolsets - - def close(self) -> None: - """No-op.""" - - def _build_argv( - self, prompt: str, *, session_id: str | None - ) -> list[str]: - argv = [self.binary, "chat", "-q", prompt, *self.extra_args] - if self.model: - argv.extend(["-m", self.model]) - if self.ignore_rules: - argv.append("--ignore-rules") - if self.toolsets: - argv.extend(["-t", self.toolsets]) - if session_id is None: - argv.append("--pass-session-id") - else: - argv.extend(["--resume", session_id]) - return argv - - def _run( - self, - argv: list[str], - env: dict | None, - timeout: float | None, - ) -> subprocess.CompletedProcess: - return self.sandbox.run( - argv, - env=env or {}, - cwd=self.cwd, - timeout=timeout, - ) - - def start( - self, - prompt: str, - *, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - proc = self._run( - self._build_argv(prompt, session_id=None), env, timeout - ) - combined = proc.stdout + "\n" + proc.stderr - return AgentTurn( - session_id=parse_session_id(combined), - response_text=parse_response_text(proc.stdout), - returncode=proc.returncode, - stdout=proc.stdout, - stderr=proc.stderr, - tool_errors=parse_tool_errors(proc.stdout), - ) - - def resume( - self, - prompt: str, - *, - session_id: str, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - proc = self._run( - self._build_argv(prompt, session_id=session_id), env, timeout - ) - return AgentTurn( - session_id=session_id, - response_text=parse_response_text(proc.stdout), - returncode=proc.returncode, - stdout=proc.stdout, - stderr=proc.stderr, - tool_errors=parse_tool_errors(proc.stdout), - ) diff --git a/src/agentcap/drivers/opencode.py b/src/agentcap/drivers/opencode.py deleted file mode 100644 index 34eaaae..0000000 --- a/src/agentcap/drivers/opencode.py +++ /dev/null @@ -1,236 +0,0 @@ -"""OpenCode driver. - -Drives ``opencode run --format json`` non-interactively. The provider -config (proxy URL, ``minimal`` agent definition) is baked into the -per-agent image at ``~/.config/opencode/opencode.json`` — see -[containers/agentcap-opencode.Containerfile]( -../../../containers/agentcap-opencode.Containerfile). The driver -passes the model id at the CLI (``--model local/``); session -continuity is via ``--session`` on resume. - -OpenCode emits NDJSON events on stdout when invoked with -``--format json``. ``text`` events carry assistant chunks; the -session id appears in every event as ``sessionID``. - -Always launch from a real project dir — opencode hangs ≥30 min if -the model directs it to recursively glob from filesystem root. -""" - -from __future__ import annotations - -import json -import subprocess -from pathlib import Path -from typing import Sequence - -from . import AgentDriver, AgentTurn -from ..sandbox import Sandbox - - -_DEFAULT_PROVIDER_NAME = "local" - - -def _iter_events(stdout: str): - for line in stdout.splitlines(): - line = line.strip() - if not line: - continue - try: - yield json.loads(line) - except json.JSONDecodeError: - continue - - -def parse_response_text(stdout: str) -> str: - """Concatenate ``text`` events from an opencode NDJSON stream.""" - parts: list[str] = [] - for obj in _iter_events(stdout): - if obj.get("type") == "text" and isinstance(obj.get("text"), str): - parts.append(obj["text"]) - return "".join(parts).strip() - - -def parse_session_id(stdout: str) -> str | None: - """Pull the first ``sessionID`` field out of the NDJSON stream.""" - for obj in _iter_events(stdout): - sid = obj.get("sessionID") - if isinstance(sid, str) and sid: - return sid - # Some events nest it under ``part``. - part = obj.get("part") - if isinstance(part, dict): - sid = part.get("sessionID") - if isinstance(sid, str) and sid: - return sid - return None - - -def parse_tool_errors(stdout: str) -> list[str]: - """Extract tool-call errors from opencode's NDJSON stream. - - Each ``tool_use`` event carries a ``part.state`` block with a - ``status`` field (``"completed"`` / ``"error"``) and, on error, - an ``error`` message + the failing ``input``. We surface every - error as ``": "`` so the caller can fail loud - rather than mistake a destructive or no-op tool call for a real - edit. - """ - errors: list[str] = [] - for obj in _iter_events(stdout): - if obj.get("type") != "tool_use": - continue - part = obj.get("part") or {} - state = part.get("state") or {} - if state.get("status") != "error": - continue - tool = part.get("tool") or "" - msg = state.get("error") or "(no error message)" - errors.append(f"{tool}: {msg}") - return errors - - -# Retained for tests and back-compat callers. Not used by OpenCodeDriver -# at runtime — the equivalent JSON is baked into the per-agent image. -_MINIMAL_AGENT_PROMPT = ( - "You are a coding assistant. Always make code changes by CALLING " - "the edit tool — do NOT just describe the change in prose. The " - "user's task is incomplete until your tool call actually modifies " - "the file. Use read first to see the current contents, then edit " - "to change them. Stop after a successful edit." -) - - -def build_opencode_config( - *, - provider_name: str, - base_url: str, - model_id: str, - context_window: int = 65536, - max_tokens: int = 8192, - minimal_agent: bool = False, -) -> dict: - """Render an ``opencode.json`` payload. Kept for tests; the - production path bakes the equivalent into the image.""" - cfg: dict = { - "$schema": "https://opencode.ai/config.json", - "provider": { - provider_name: { - "npm": "@ai-sdk/openai-compatible", - "name": f"Local via agentcap proxy ({base_url})", - "options": {"baseURL": base_url}, - "models": { - model_id: { - "name": model_id, - "options": {"max_tokens": max_tokens}, - "limit": {"context": context_window, "output": max_tokens}, - } - }, - } - }, - "model": f"{provider_name}/{model_id}", - } - if minimal_agent: - cfg["agent"] = { - "minimal": { - "description": "Stripped agent for CI / small-model CPU runs.", - # ``primary`` makes it selectable via ``--agent minimal``. - # Without ``mode``, opencode 1.15.x treats the agent as - # a subagent (@ autocomplete only) and the CLI flag - # falls through to "default agent". - "mode": "primary", - "model": f"{provider_name}/{model_id}", - "prompt": _MINIMAL_AGENT_PROMPT, - "permission": {"*": "deny", "read": "allow", "edit": "allow"}, - } - } - return cfg - - -class OpenCodeDriver(AgentDriver): - name = "opencode" - - def __init__( - self, - *, - sandbox: Sandbox, - binary: str = "opencode", - model: str | None = None, - cwd: Path | str | None = None, - provider_name: str = _DEFAULT_PROVIDER_NAME, - extra_args: Sequence[str] = (), - minimal_agent: bool = False, - ) -> None: - self.sandbox = sandbox - self.binary = binary - self.model = model - self.cwd = str(cwd) if cwd is not None else None - self.provider_name = provider_name - self.extra_args = list(extra_args) - self.minimal_agent = minimal_agent - - def close(self) -> None: - """No-op.""" - - def _build_argv( - self, prompt: str, *, session_id: str | None = None - ) -> list[str]: - argv = [self.binary, "run", "--format", "json"] - if self.model: - argv.extend(["--model", f"{self.provider_name}/{self.model}"]) - if self.minimal_agent: - argv.extend(["--agent", "minimal"]) - if session_id: - argv.extend(["--session", session_id]) - argv.extend(self.extra_args) - argv.append(prompt) - return argv - - def _run( - self, - argv: list[str], - env: dict | None, - timeout: float | None, - ) -> subprocess.CompletedProcess: - return self.sandbox.run( - argv, - env=env or {}, - cwd=self.cwd, - timeout=timeout, - ) - - def start( - self, - prompt: str, - *, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - proc = self._run(self._build_argv(prompt), env, timeout) - return AgentTurn( - session_id=parse_session_id(proc.stdout), - response_text=parse_response_text(proc.stdout), - returncode=proc.returncode, - stdout=proc.stdout, - stderr=proc.stderr, - tool_errors=parse_tool_errors(proc.stdout), - ) - - def resume( - self, - prompt: str, - *, - session_id: str, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - proc = self._run( - self._build_argv(prompt, session_id=session_id), env, timeout - ) - return AgentTurn( - session_id=parse_session_id(proc.stdout) or session_id, - response_text=parse_response_text(proc.stdout), - returncode=proc.returncode, - stdout=proc.stdout, - stderr=proc.stderr, - tool_errors=parse_tool_errors(proc.stdout), - ) diff --git a/src/agentcap/drivers/pi.py b/src/agentcap/drivers/pi.py deleted file mode 100644 index 776a70d..0000000 --- a/src/agentcap/drivers/pi.py +++ /dev/null @@ -1,183 +0,0 @@ -"""pi-mono coding-agent driver. - -Drives ``pi -p "" --provider local --model `` non- -interactively. The provider config (proxy URL, model entries) and -PI_CODING_AGENT_DIR are baked into the per-agent image — see -[containers/agentcap-pi.Containerfile]( -../../../containers/agentcap-pi.Containerfile). The driver passes -the model id at the CLI. - -Native sessions: pi tracks the most recent session under -``PI_CODING_AGENT_SESSION_DIR`` and resumes via ``--continue``. The -driver lets pi mint its own UUID on ``start`` (no flag), then passes -``--continue`` on ``resume``. The image's init script symlinks -``PI_CODING_AGENT_SESSION_DIR`` at the bind-mounted ``traces/`` dir -so session state survives ``podman run --rm`` boundaries. -""" - -from __future__ import annotations - -import subprocess -from pathlib import Path -from typing import Sequence - -from . import AgentDriver, AgentTurn -from ..sandbox import Sandbox - - -_DEFAULT_PROVIDER_NAME = "local" - - -def parse_tool_errors(stdout: str) -> list[str]: - # TODO: pi's tool-error format is not yet characterised. - return [] - - -def build_models_json( - *, - provider_name: str, - base_url: str, - model_id: str, - api_key_env: str = "PI_LOCAL_API_KEY", - context_window: int = 65536, - max_tokens: int = 4096, -) -> dict: - """Render a pi ``models.json`` payload. Kept for tests; the - production path bakes the equivalent into the image.""" - return { - "providers": { - provider_name: { - "baseUrl": base_url, - "api": "openai-completions", - "apiKey": api_key_env, - "compat": { - "supportsDeveloperRole": False, - "supportsReasoningEffort": False, - }, - "models": [ - { - "id": model_id, - "name": model_id, - "reasoning": False, - "input": ["text"], - "contextWindow": context_window, - "maxTokens": max_tokens, - "cost": { - "input": 0, "output": 0, - "cacheRead": 0, "cacheWrite": 0, - }, - } - ], - } - } - } - - -class PiDriver(AgentDriver): - name = "pi" - # In-container path where pi writes its native session files. - # Bind-mounted to ``/traces/`` by ``agentcap run`` so the - # agent's own trace ends up next to the proxy captures. - sessions_path = "/opt/pi-config/sessions" - - def __init__( - self, - *, - sandbox: Sandbox, - binary: str = "pi", - model: str | None = None, - cwd: Path | str | None = None, - provider_name: str = _DEFAULT_PROVIDER_NAME, - extra_args: Sequence[str] = (), - ) -> None: - self.sandbox = sandbox - self.binary = binary - self.model = model - self.cwd = str(cwd) if cwd is not None else None - self.provider_name = provider_name - self.extra_args = list(extra_args) - - def close(self) -> None: - """No-op.""" - - def _build_argv( - self, - prompt: str, - *, - resume: bool, - no_session: bool, - ) -> list[str]: - argv = [ - self.binary, - "-p", - prompt, - "--provider", - self.provider_name, - *self.extra_args, - ] - if self.model: - argv.extend(["--model", self.model]) - if no_session: - argv.append("--no-session") - elif resume: - argv.append("--continue") - return argv - - def _run( - self, - argv: list[str], - env: dict | None, - timeout: float | None, - ) -> subprocess.CompletedProcess: - return self.sandbox.run( - argv, - env=env or {}, - cwd=self.cwd, - timeout=timeout, - ) - - def start( - self, - prompt: str, - *, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - # No --session: pi mints its own UUID and writes it under - # PI_CODING_AGENT_SESSION_DIR. Resume picks the latest via - # --continue (synthetic marker returned to the orchestrator). - proc = self._run( - self._build_argv(prompt, resume=False, no_session=False), - env, - timeout, - ) - return AgentTurn( - session_id="latest", - response_text=proc.stdout.strip(), - returncode=proc.returncode, - stdout=proc.stdout, - stderr=proc.stderr, - tool_errors=parse_tool_errors(proc.stdout), - ) - - def resume( - self, - prompt: str, - *, - session_id: str, - env: dict | None = None, - timeout: float | None = None, - ) -> AgentTurn: - proc = self._run( - self._build_argv(prompt, resume=True, no_session=False), - env, - timeout, - ) - return AgentTurn( - session_id=session_id, - response_text=proc.stdout.strip(), - returncode=proc.returncode, - stdout=proc.stdout, - stderr=proc.stderr, - tool_errors=parse_tool_errors(proc.stdout), - ) diff --git a/src/agentcap/export.py b/src/agentcap/export.py deleted file mode 100644 index 76d1b64..0000000 --- a/src/agentcap/export.py +++ /dev/null @@ -1,686 +0,0 @@ -"""Capture dir → parquet export. - -For each ``.request.json``, pair with the matching -``.response.json`` and emit one parquet row. - -Destination: ``--push /[/]`` — uploaded into a -Hugging Face Dataset repo. Files under ``data/`` get the Hub Dataset -Viewer automatically. -""" - -from __future__ import annotations - -import json -from pathlib import Path -from typing import Iterator - -from .provider import _hostname_fallback, refine_for_sub_provider - - -def detect_provider_columns(capture_dir: Path | str) -> dict: - """Derive ``provider`` + ``upstream_url`` from the per-request - ``upstream_url`` stamp. Empty dict for legacy capture dirs missing - the stamp.""" - for req_path in sorted(Path(capture_dir).glob("*.request.json")): - try: - rec = json.loads(req_path.read_text()) - except (OSError, json.JSONDecodeError): - continue - upstream_url = rec.get("upstream_url") - if not isinstance(upstream_url, str) or not upstream_url: - continue - model = (rec.get("body") or {}).get("model") - provider = refine_for_sub_provider( - _hostname_fallback(upstream_url), - model if isinstance(model, str) else None, - ) - return {"provider": provider, "upstream_url": upstream_url} - return {} - - -def detect_model(capture_dir: Path | str) -> str | None: - """Unique ``body.model`` across all captured requests, or ``None``. - Raises ``ValueError`` on mixed models (datasets never mix models). - ``@revision`` suffixes are stripped.""" - capture_dir = Path(capture_dir) - seen: set[str] = set() - for req_path in sorted(capture_dir.glob("*.request.json")): - try: - rec = json.loads(req_path.read_text()) - except (OSError, json.JSONDecodeError): - continue - m = (rec.get("body") or {}).get("model") - if isinstance(m, str) and m: - seen.add(_bare_model_id(m)) - if len(seen) > 1: - raise ValueError( - f"capture dir contains requests for multiple models: " - f"{sorted(seen)}. Datasets never mix models — split into " - f"separate capture dirs and export each one independently." - ) - return seen.pop() if seen else None - - -def _bare_model_id(model: str) -> str: - """Strip ``@revision`` suffix so ``gemma-4-E4B-it`` and - ``gemma-4-E4B-it@main`` are treated as the same id.""" - return model.split("@", 1)[0] - - -def _iter_pairs( - capture_dir: Path, -) -> Iterator[tuple[str, dict, dict | None, int, dict, str | None, int | None]]: - """Yield (request_id, request_body, response_body, captured_at, - upstream_fingerprint, task_id, turn) per captured request, in - filename order. ``task_id`` / ``turn`` come from the wrapping - ``.request.json`` record (orchestrator-side metadata that isn't - inside the OpenAI body) — preserving them in the parquet lets - downstream picker UIs group + index rows without having to fall - back to ``-``.""" - for req_path in sorted(capture_dir.glob("*.request.json")): - rec = json.loads(req_path.read_text()) - rid = rec.get("request_id") or req_path.stem.split(".")[0] - captured_at = int(rec.get("captured_at", 0)) - body = rec.get("body") or {} - task_id = rec.get("task_id") - turn = rec.get("turn") - resp_path = capture_dir / f"{rid}.response.json" - resp_body: dict | None = None - upstream_fp: dict = {} - if resp_path.exists(): - resp_rec = json.loads(resp_path.read_text()) - upstream_fp = resp_rec.get("upstream_fingerprint") or {} - if resp_rec.get("stream"): - resp_body = {"stream": True, "raw": resp_rec.get("raw", "")} - else: - resp_body = resp_rec.get("body") or {} - yield rid, body, resp_body, captured_at, upstream_fp, task_id, turn - - -def _fingerprint_columns(fp: dict | None) -> dict: - fp = fp or {} - return { - "served_by": fp.get("x_served_by"), - "served_build_info": fp.get("build_info"), - "served_model": fp.get("served_model"), - } - - -def _row( - request_id: str, - request_body: dict, - response_body: dict | None, - captured_at: int, - upstream_fp: dict | None, - task_id: str | None = None, - turn: int | None = None, -) -> dict: - # request / response stringified so Arrow doesn't infer a schema over - # heterogeneous tool-schema fields. Consumers json.loads them. - model = (request_body.get("model") or "") if isinstance(request_body, dict) else "" - return { - "request_id": request_id, - "model": model, - "captured_at": captured_at, - "task_id": task_id, - "turn": turn, - "request": json.dumps(request_body, ensure_ascii=False), - "response": json.dumps(response_body or {}, ensure_ascii=False), - **_fingerprint_columns(upstream_fp), - } - - -def export_local( - capture_dir: Path | str, - output: Path | str, - *, - batch_size: int = 32, - progress: bool = True, - provider_columns: dict | None = None, - agent: str | None = None, - model: str | None = None, -) -> int: - """Stream the capture dir into a single parquet. Returns row count. - Batches via ``ParquetWriter`` so a mid-render kill leaves a valid - parquet up to the last flushed batch. - - ``agent`` and ``model`` are stamped into the parquet's schema-level - KV metadata so downstream consumers (``agentcap inspect``'s picker - in particular) can label each parquet without re-parsing the - filename — that filename is a brittle contract, the KV metadata is - the authoritative source.""" - import pyarrow as pa - import pyarrow.parquet as pq - - capture_dir = Path(capture_dir) - output = Path(output) - output.parent.mkdir(parents=True, exist_ok=True) - if provider_columns is None: - provider_columns = detect_provider_columns(capture_dir) - - request_files = sorted(capture_dir.glob("*.request.json")) - total = len(request_files) - if total == 0: - raise ValueError(f"no captured requests in {capture_dir}") - - pairs_iter = _iter_pairs(capture_dir) - if progress: - try: - from tqdm import tqdm - pairs_iter = tqdm( - pairs_iter, - total=total, - desc=f"export {capture_dir.name}", - unit="row", - ) - except ImportError: - pass - - writer: pq.ParquetWriter | None = None - schema: pa.Schema | None = None - batch: list[dict] = [] - n_written = 0 - # ``tasks_buf`` accumulates {task_id → (max-turn, first-user-prompt)} - # across all batches so the parquet's schema KV ends up with an - # accurate, complete task list. We post-process the parquet at the - # end to stamp it (the streaming writer's schema is fixed at open). - tasks_buf: dict[str, dict] = {} - - def _absorb_tasks(rows: list[dict]) -> None: - for r in rows: - tid = r.get("task_id") - if not tid: - continue - d = tasks_buf.setdefault(tid, {"turns": 0, "prompt": None}) - turn = r.get("turn") - if turn is not None and int(turn) > d["turns"]: - d["turns"] = int(turn) - if d["prompt"] is None: - try: - body = json.loads(r.get("request") or "{}") - except (json.JSONDecodeError, ValueError, TypeError): - body = {} - for m in body.get("messages") or []: - if m.get("role") == "user": - content = m.get("content") or "" - if isinstance(content, list): - content = " ".join( - c.get("text", "") for c in content - if isinstance(c, dict) - ) - d["prompt"] = ( - (content or "").replace("\n", " ").strip()[:200] - ) - break - - def _flush(rows: list[dict]) -> None: - nonlocal writer, schema, n_written - if not rows: - return - if provider_columns: - for r in rows: - for k, v in provider_columns.items(): - r.setdefault(k, v) - _absorb_tasks(rows) - table = pa.Table.from_pylist(rows) - # ``task_id`` / ``turn`` are optional orchestrator metadata. - # If the first batch's values are all ``None``, Arrow infers - # ``null`` for the column type and the writer's schema locks - # that in — every later batch with non-null values then fails - # ``table.cast(schema)``. Force the canonical dtypes up front - # so the first-batch dtype matches what later batches will - # carry. - for col, dtype in (("task_id", pa.string()), ("turn", pa.int64())): - if col not in table.schema.names: - continue - field = table.schema.field(col) - if pa.types.is_null(field.type): - idx = table.schema.get_field_index(col) - table = table.set_column( - idx, col, pa.array([None] * table.num_rows, type=dtype), - ) - if writer is None: - kv = { - k.encode(): v.encode() - for k, v in (("agent", agent), ("model", model)) - if v - } - schema = table.schema.with_metadata(kv) if kv else table.schema - writer = pq.ParquetWriter(str(output), schema) - else: - table = table.cast(schema) - writer.write_table(table) - n_written += len(rows) - - try: - for rid, body, resp, captured_at, upstream_fp, task_id, turn in pairs_iter: - batch.append( - _row(rid, body, resp, captured_at, upstream_fp, task_id, turn) - ) - if len(batch) >= batch_size: - _flush(batch) - batch = [] - _flush(batch) - finally: - if writer is not None: - writer.close() - - # The streaming writer freezes the schema at open, so we can't - # stamp ``tasks`` until we've consumed every row. - if tasks_buf: - tasks_list = [ - {"id": tid, "turns": d["turns"], "prompt": d["prompt"]} - for tid, d in sorted(tasks_buf.items()) - ] - table = pq.read_table(str(output)) - kv = dict(table.schema.metadata or {}) - kv[b"tasks"] = json.dumps(tasks_list, ensure_ascii=False).encode() - # ``Path.replace`` is POSIX-atomic — a kill mid-rewrite leaves - # the original intact. - tmp = output.with_suffix(output.suffix + ".rewrite") - try: - pq.write_table(table.replace_schema_metadata(kv), str(tmp)) - tmp.replace(output) - except Exception: - if tmp.exists(): - tmp.unlink() - raise - - return n_written - - -def parse_collection_base(uri: str) -> tuple[str, str]: - """Split ``/`` (optionally prefixed with - ``hf://datasets/``) into ``("", "")``. - - ```` drives all three artifacts: captures dataset - ``/-captures``, traces dataset ``/-traces``, - and the HF Collection of the same title under ````.""" - s = uri.removeprefix("hf://datasets/").strip("/") - parts = s.split("/") - if len(parts) != 2 or not parts[0] or not parts[1]: - raise ValueError( - f"--push must be /, got {uri!r}" - ) - return parts[0], parts[1] - - -def captures_repo_id(owner: str, base: str) -> str: - return f"{owner}/{base}-captures" - - -_FILENAME_SAFE = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_." - - -def _slug(s: str) -> str: - """Filename-safe slug. Strips ``org/`` prefix from HF model ids.""" - s = s.split("/")[-1] - out = "".join(c if c in _FILENAME_SAFE else "-" for c in s) - while "--" in out: - out = out.replace("--", "-") - return out.strip("-_.") or "x" - - -def _default_filename( - agent: str | None = None, - model: str | None = None, - provider: str | None = None, -) -> str: - """``train-[---]-.parquet``.""" - import time - import uuid - - ts = time.strftime("%Y%m%dT%H%M%S", time.gmtime()) - parts = ["train"] - if agent: - parts.append(_slug(agent)) - if model: - parts.append(_slug(model)) - if provider: - # Preserve hf-router/fireworks-ai → hf-router-fireworks-ai - # (``_slug`` would otherwise strip everything before the last /). - parts.append(_slug(provider.replace("/", "-"))) - parts.append(ts) - parts.append(uuid.uuid4().hex[:6]) - return "-".join(parts) + ".parquet" - - -_CAPTURES_README_TEMPLATE = """\ ---- -license: apache-2.0 -tags: -- agentcap -- agentcap-captures ---- - -# {repo_id} - -HTTP captures of agent ↔ model interactions — one parquet row per -`/v1/chat/completions` call. Produced by -[agentcap](https://github.com/huggingface/agentcap). - -Native session traces for the same runs live in companion datasets -named `{base}--traces`. They're all grouped under the -[{collection_title} Collection](https://huggingface.co/{owner}) -alongside this dataset. Join on `run_id`. - -## Loading - -```python -from datasets import load_dataset - -ds = load_dataset("{repo_id}", split="train") -``` - -## Schema - -| column | description | -|---|---| -| `run_id` | agentcap run id; matches the per-run folder in the traces dataset | -| `request_id` | UUID minted by the capture proxy | -| `model` | Model id from the captured request body | -| `captured_at` | Epoch seconds when the request was captured | -| `request` | Raw OpenAI request body, JSON-stringified | -| `response` | Raw OpenAI response body, JSON-stringified (or `{{"stream": true, "raw": ...}}` for SSE) | -| `served_by` | Per-response `X-Served-By` header (HF Router sub-provider routing) | -| `served_build_info` | Per-response `X-Build-Info` header | -| `served_model` | Per-response body-echoed `model` | -| `provider` | Derived from the proxy upstream URL (constant per file) | -| `upstream_url` | Proxy upstream URL at capture time (constant per file) | - -`request` and `response` are JSON strings; consumers `json.loads(...)` -them. To recover per-message token ranges, render `request.messages` -through the model's chat template yourself — -`transformers.AutoTokenizer.apply_chat_template`. -""" - - -_TRACES_README_TEMPLATE = """\ ---- -license: apache-2.0 -tags: -- agent-traces -- agentcap -- agentcap-traces -- agentcap-traces-{agent} -source_datasets: -- {captures_repo} ---- - -# {repo_id} - -{agent} coding-agent session traces produced by -[agentcap](https://github.com/huggingface/agentcap) runs. Each run -contributes one folder under `data//`; inside, one file per -session in `{agent}`'s native export format. - -The on-the-wire HTTP captures for these same runs live in -[{captures_repo}](https://huggingface.co/datasets/{captures_repo}). -Both belong to the -[{collection_title} Collection](https://huggingface.co/{owner}) -— join on `run_id` to align captures with traces. -""" - - -def traces_repo_id_for(owner: str, base: str, agent: str) -> str: - """Per-agent traces dataset id. One agent per dataset keeps the - schema homogeneous — the Hub viewer can't reconcile pi's - type-discriminated events with goose's session-as-object dump.""" - return f"{owner}/{base}-{agent}-traces" - - -def _captures_readme( - *, - repo_id: str, - owner: str, - base: str, - collection_title: str, -) -> str: - return _CAPTURES_README_TEMPLATE.format( - repo_id=repo_id, - owner=owner, - base=base, - collection_title=collection_title, - ) - - -def _traces_readme( - *, - repo_id: str, - captures_repo: str, - owner: str, - collection_title: str, - agent: str, -) -> str: - return _TRACES_README_TEMPLATE.format( - repo_id=repo_id, - captures_repo=captures_repo, - owner=owner, - collection_title=collection_title, - agent=agent, - ) - - -def push_captures_dataset( - items: list[dict], - *, - owner: str, - base: str, -) -> tuple[str, list[int]]: - """Render N capture dirs to parquet under ``/-captures`` - in a single commit. Returns ``(repo_id, [n_rows...])``. - - ``items`` is a list of dicts, each with: - - ``capture_dir`` (required): path to a capture dir - - ``model`` (required): model id used in the default filename - - ``agent`` (optional): agent name embedded in the default filename - - ``run_id`` (optional): stamped onto every row + into the filename - - ``filename`` (optional): overrides the default unique name - - The repo is created on first push (``exist_ok=True``); files land - under ``data/.parquet`` so the Hub Dataset Viewer picks - them up automatically. - """ - import tempfile - - from huggingface_hub import CommitOperationAdd, HfApi - - repo_id = captures_repo_id(owner, base) - api = HfApi() - api.create_repo( - repo_id=repo_id, repo_type="dataset", - private=True, exist_ok=True, - ) - - # Seed a dataset card on first push (no README in the repo yet). - # Later pushes leave any existing README alone — including - # user-edited ones. - try: - existing = set(api.list_repo_files(repo_id, repo_type="dataset")) - except Exception: - existing = set() - include_readme = "README.md" not in existing - - n_rows_list: list[int] = [] - with tempfile.TemporaryDirectory() as tmpdir: - operations: list[CommitOperationAdd] = [] - if include_readme: - operations.append(CommitOperationAdd( - path_in_repo="README.md", - path_or_fileobj=_captures_readme( - repo_id=repo_id, - owner=owner, - base=base, - collection_title=base, - ).encode("utf-8"), - )) - for i, item in enumerate(items): - cap_dir = item["capture_dir"] - model = item["model"] - agent = item.get("agent") - run_id = item.get("run_id") - filename = item.get("filename") - provider_columns = detect_provider_columns(cap_dir) - extra_columns = dict(provider_columns) - if run_id: - extra_columns["run_id"] = run_id - if filename is None: - filename = _default_filename( - agent=agent, - model=model, - provider=provider_columns.get("provider") or None, - ) - path_in_repo = f"data/{filename}" - local_file = Path(tmpdir) / f"{i}-{filename}" - n_rows = export_local( - cap_dir, local_file, provider_columns=extra_columns, - progress=False, agent=agent, model=model, - ) - n_rows_list.append(n_rows) - operations.append(CommitOperationAdd( - path_in_repo=path_in_repo, - path_or_fileobj=str(local_file), - )) - - api.create_commit( - repo_id=repo_id, - repo_type="dataset", - operations=operations, - commit_message=f"agentcap export: add {len(operations)} parquet(s)", - ) - - return repo_id, n_rows_list - - -def push_agent_traces_dataset( - items: list[dict], - *, - owner: str, - base: str, - agent: str, -) -> tuple[str, int]: - """Upload raw trace files for ONE agent under - ``/--traces`` in a single commit. Returns - ``(repo_id, n_files_total)``. - - ``items`` is a list of dicts, each with: - - ``traces_dir`` (required): path to a ``/traces/`` dir - - ``run_id`` (required): folder name in the dataset repo - - Splitting by agent (one dataset per agent) keeps each dataset's - schema homogeneous — the Hub viewer can't reconcile pi's - type-discriminated events with goose's session-as-object dump. - - Files are uploaded **as-is** — no JSON parsing, no schema - transformation. Empty trace dirs contribute 0 files. Returns 0 - files when the entire item list has no files; the repo is still - created so the collection link stays consistent. - """ - from huggingface_hub import CommitOperationAdd, HfApi - - repo_id = traces_repo_id_for(owner, base, agent) - captures_repo = captures_repo_id(owner, base) - api = HfApi() - api.create_repo( - repo_id=repo_id, repo_type="dataset", - private=True, exist_ok=True, - ) - - try: - existing = set(api.list_repo_files(repo_id, repo_type="dataset")) - except Exception: - existing = set() - include_readme = "README.md" not in existing - - operations: list[CommitOperationAdd] = [] - if include_readme: - operations.append(CommitOperationAdd( - path_in_repo="README.md", - path_or_fileobj=_traces_readme( - repo_id=repo_id, - captures_repo=captures_repo, - owner=owner, - collection_title=base, - agent=agent, - ).encode("utf-8"), - )) - - n_files = 0 - for item in items: - traces_dir = Path(item["traces_dir"]) - run_id = item["run_id"] - if not traces_dir.is_dir(): - continue - for f in sorted(p for p in traces_dir.iterdir() if p.is_file()): - operations.append(CommitOperationAdd( - path_in_repo=f"data/{run_id}/{f.name}", - path_or_fileobj=str(f), - )) - n_files += 1 - - # Only commit if we have something to add. If even the README is - # already up, skip the empty commit silently. - if not operations: - return repo_id, n_files - api.create_commit( - repo_id=repo_id, - repo_type="dataset", - operations=operations, - commit_message=( - f"agentcap export: add {agent} traces " - f"({n_files} file(s) across {len(items)} run(s))" - ), - ) - - return repo_id, n_files - - -def ensure_collection( - *, - owner: str, - base: str, - repos: list[str], -) -> str: - """Find-or-create the ``/`` collection and ensure every - repo in ``repos`` is an item. Returns the collection slug. - - Idempotent: existing items are kept (``exists_ok=True``).""" - from huggingface_hub import HfApi - - api = HfApi() - slug: str | None = None - try: - for c in api.list_collections(owner=owner, q=base, limit=20): - if c.title == base: - slug = c.slug - break - except Exception: - slug = None - - if slug is None: - col = api.create_collection( - title=base, - namespace=owner, - description=( - "agentcap: paired HTTP captures + native session " - "traces. Join on run_id." - ), - private=True, - exists_ok=True, - ) - slug = col.slug - - for repo in repos: - try: - api.add_collection_item( - collection_slug=slug, - item_id=repo, - item_type="dataset", - exists_ok=True, - ) - except Exception: - # Item-add isn't load-bearing — the README cross-links - # already make the relationship discoverable. Keep going. - pass - - return slug diff --git a/src/agentcap/followups/__init__.py b/src/agentcap/followups/__init__.py deleted file mode 100644 index 09f2c0c..0000000 --- a/src/agentcap/followups/__init__.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Follow-up strategies for multi-turn agent runs. - -Each strategy implements ``FollowUp.next(...)`` returning the next -user message to feed to the agent given the prior turn's response and -the original task. Strategies are stateful (``templates`` rotates a -pool, ``synthesized`` may keep a model client) but the contract is -the same. - -Three built-in strategies, in increasing order of cost / realism: - - - ``continue`` (default): the literal string ``"continue"``. Cheapest - and maximises cross-session match opportunity since user-message - tokens are byte-identical across sessions. - - ``templates``: rotates through a small pool (``"continue"``, - ``"go on"``, ``"what else?"``, ``"keep going"``). - - ``synthesized``: feeds (original task + agent's last response) - into a separate model call to produce a realistic follow-up. The - synthesizer call **bypasses the capture proxy** by design — its - requests are not part of the capture. -""" - -from __future__ import annotations - -import abc - - -class FollowUp(abc.ABC): - """Strategy for picking the next user message in a multi-turn run.""" - - name: str - - @abc.abstractmethod - def next(self, *, original_task: str, last_response: str, turn: int) -> str: - """Return the next user message. - - ``turn`` is the 1-indexed number of the *upcoming* turn (so the - first follow-up is ``turn=2`` because the original task was - turn 1). Strategies that don't care about ``turn`` simply ignore - the arg. - """ - - -def get_followup(name: str, **kwargs) -> FollowUp: - if name == "continue": - from .continue_ import ContinueFollowUp - - return ContinueFollowUp(**kwargs) - if name == "templates": - from .templates import TemplatesFollowUp - - return TemplatesFollowUp(**kwargs) - if name == "synthesized": - from .synthesized import SynthesizedFollowUp - - return SynthesizedFollowUp(**kwargs) - raise ValueError(f"unknown follow-up strategy: {name!r}") - - -__all__ = ["FollowUp", "get_followup"] diff --git a/src/agentcap/followups/continue_.py b/src/agentcap/followups/continue_.py deleted file mode 100644 index f64dd07..0000000 --- a/src/agentcap/followups/continue_.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Literal-``continue`` follow-up strategy.""" - -from __future__ import annotations - -from . import FollowUp - - -class ContinueFollowUp(FollowUp): - name = "continue" - - def __init__(self, text: str = "continue") -> None: - self.text = text - - def next(self, *, original_task: str, last_response: str, turn: int) -> str: - return self.text diff --git a/src/agentcap/followups/synthesized.py b/src/agentcap/followups/synthesized.py deleted file mode 100644 index 3bcf10e..0000000 --- a/src/agentcap/followups/synthesized.py +++ /dev/null @@ -1,127 +0,0 @@ -"""Synthesized follow-up strategy. - -Sends ``(original_task, agent's last response)`` to a small synthesizer -LLM and uses the response as the next user message. - -By design the synthesizer call **bypasses the capture proxy** — it -talks to the model server (or a different endpoint) directly. The -capture must remain a clean record of agent↔model interaction; -the synthesizer is just a way to produce realistic next user inputs. -""" - -from __future__ import annotations - -import json -import sys -from typing import Callable - -from . import FollowUp - - -PROMPT_TEMPLATE = """\ -You are a developer interacting with a coding agent. Given the agent's -last response, produce ONE short follow-up question or instruction -(<=30 words) that pushes the conversation forward. Don't ask the -agent to summarise; ask it to do or show something. - -Original task: -<<<{task}>>> - -Agent's last response: -<<<{response}>>> - -Follow-up: -""" - - -def _default_call_synth( - *, - upstream: str, - model: str, - prompt: str, - timeout: float | None, - api_key: str | None = None, -) -> str: - """Default OpenAI-compat chat-completion call.""" - import httpx - - body = { - "model": model, - "messages": [{"role": "user", "content": prompt}], - # Reason-by-default models (Gemma-4, Qwen3.5+) burn the budget - # in reasoning_content before the answer; an 80-token cap was - # silently producing empty content + finish_reason="length". - "max_tokens": 2048, - "temperature": 0.7, - } - base = upstream.rstrip("/") - if base.endswith("/v1"): - url = base + "/chat/completions" - else: - url = base + "/v1/chat/completions" - headers = {"Authorization": f"Bearer {api_key}"} if api_key else None - resp = httpx.post(url, json=body, timeout=timeout, headers=headers) - resp.raise_for_status() - data = resp.json() - try: - return data["choices"][0]["message"]["content"].strip() - except (KeyError, IndexError, TypeError) as exc: - raise RuntimeError( - f"synthesizer response missing choices[0].message.content: " - f"{json.dumps(data)[:200]}" - ) from exc - - -class SynthesizedFollowUp(FollowUp): - name = "synthesized" - - def __init__( - self, - *, - upstream: str, - model: str, - timeout: float | None = 60, - call: Callable[..., str] | None = None, - prompt_template: str = PROMPT_TEMPLATE, - fallback: str = "continue", - api_key: str | None = None, - ) -> None: - """``upstream`` should point at the model server **directly**, - not at the capture proxy. ``call`` is overridable for tests. - ``api_key`` is forwarded as ``Authorization: Bearer …`` on each - synthesizer call — required for authenticated upstreams like - the HF Router.""" - self.upstream = upstream - self.model = model - self.timeout = timeout - self._call = call or _default_call_synth - self.prompt_template = prompt_template - self.fallback = fallback - self.api_key = api_key - - def next(self, *, original_task: str, last_response: str, turn: int) -> str: - prompt = self.prompt_template.format( - task=original_task, response=last_response - ) - try: - text = self._call( - upstream=self.upstream, - model=self.model, - prompt=prompt, - timeout=self.timeout, - api_key=self.api_key, - ) - except Exception as exc: - # Silence here used to mask 401s against authenticated upstreams, - # making the whole sweep produce ``continue`` follow-ups while - # ``run.json`` still claimed ``followup: synthesized``. - msg = " ".join(str(exc).splitlines()) - print( - f"[followups] synthesized turn={turn} fell back to " - f"{self.fallback!r}: {type(exc).__name__}: {msg}", - file=sys.stderr, - flush=True, - ) - return self.fallback - text = text.strip() - return text or self.fallback diff --git a/src/agentcap/followups/templates.py b/src/agentcap/followups/templates.py deleted file mode 100644 index 4277ec4..0000000 --- a/src/agentcap/followups/templates.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Rotating-template follow-up strategy. - -Cycles through a small fixed pool. No extra inference cost; minor -variation in user-message tokens compared to plain ``continue``. -""" - -from __future__ import annotations - -from typing import Sequence - -from . import FollowUp - - -_DEFAULT_POOL = ("continue", "go on", "what else?", "keep going") - - -class TemplatesFollowUp(FollowUp): - name = "templates" - - def __init__(self, pool: Sequence[str] = _DEFAULT_POOL) -> None: - if not pool: - raise ValueError("templates pool must be non-empty") - self.pool = list(pool) - - def next(self, *, original_task: str, last_response: str, turn: int) -> str: - # turn=2 (first follow-up) → pool[0]; turn=3 → pool[1]; etc. - idx = (turn - 2) % len(self.pool) - return self.pool[idx] diff --git a/src/agentcap/orchestrator.py b/src/agentcap/orchestrator.py deleted file mode 100644 index 9d6bc1f..0000000 --- a/src/agentcap/orchestrator.py +++ /dev/null @@ -1,220 +0,0 @@ -"""Drive an agent CLI through a corpus of prompts. - -The orchestrator pairs an :class:`AgentDriver` with a :class:`FollowUp` -strategy and steps each task through ``turns_per_task`` turns. The -proxy that captures the actual chat-completion bytes is configured -separately (started before the orchestrator runs and pointed at via -the agent's own config); this module is intentionally proxy-agnostic. - -Per-turn driver stdout/stderr is written under -``/task__turn_.{out,err}`` for debugging. The -orchestrator's primary output is the list of :class:`TaskResult` -objects returned by :meth:`Orchestrator.run_corpus`. -""" - -from __future__ import annotations - -import subprocess -import time -from dataclasses import dataclass, field -from pathlib import Path -from typing import Callable, Iterable, Sequence - -from .drivers import AgentDriver, AgentTurn -from .followups import FollowUp - - -@dataclass -class TaskTurnResult: - turn: int # 1-indexed - prompt: str - session_id: str | None - returncode: int - response_text: str - duration_s: float - - -@dataclass -class TaskResult: - task_id: str - prompt: str - turns: list[TaskTurnResult] = field(default_factory=list) - - @property - def session_id(self) -> str | None: - if self.turns: - return self.turns[0].session_id - return None - - @property - def completed_turns(self) -> int: - return sum(1 for t in self.turns if t.returncode == 0) - - -def read_tasks_txt(path: Path | str) -> list[str]: - """Read a plain-text tasks file (one prompt per line, ``#`` comments - and blank lines ignored).""" - text = Path(path).read_text() - out: list[str] = [] - for line in text.splitlines(): - s = line.strip() - if not s or s.startswith("#"): - continue - out.append(s) - return out - - -class Orchestrator: - """Run a corpus through an agent driver with a follow-up strategy.""" - - def __init__( - self, - driver: AgentDriver, - followup: FollowUp, - *, - sessions_dir: Path | str | None = None, - set_capture_context: Callable[..., None] | None = None, - on_event: Callable[..., None] | None = None, - ) -> None: - self.driver = driver - self.followup = followup - self.sessions_dir = Path(sessions_dir) if sessions_dir else None - self.set_capture_context = set_capture_context or (lambda **_: None) - if self.sessions_dir is not None: - self.sessions_dir.mkdir(parents=True, exist_ok=True) - self.on_event = on_event or (lambda **_: None) - - def _log_turn(self, task_id: str, turn: int, agent_turn: AgentTurn) -> None: - if self.sessions_dir is None: - return - base = self.sessions_dir / f"{task_id}_turn_{turn:02d}" - base.with_suffix(".out").write_text(agent_turn.stdout) - base.with_suffix(".err").write_text(agent_turn.stderr) - - def run_task( - self, - prompt: str, - *, - task_id: str, - turns: int, - timeout: float | None = None, - ) -> TaskResult: - if turns < 1: - raise ValueError("turns must be >= 1") - - result = TaskResult(task_id=task_id, prompt=prompt) - - # Turn 1: open session - self.on_event(event="task_start", task_id=task_id, prompt=prompt, turns=turns) - self.set_capture_context(task_id=task_id, turn=1) - t0 = time.time() - try: - first = self.driver.start(prompt, timeout=timeout) - except subprocess.TimeoutExpired: - dur = time.time() - t0 - self.on_event( - event="task_aborted", - task_id=task_id, - reason="initial-turn-timeout", - duration_s=dur, - ) - return result - dur = time.time() - t0 - result.turns.append( - TaskTurnResult( - turn=1, - prompt=prompt, - session_id=first.session_id, - returncode=first.returncode, - response_text=first.response_text, - duration_s=dur, - ) - ) - self._log_turn(task_id, 1, first) - self.on_event( - event="turn_done", - task_id=task_id, - turn=1, - session_id=first.session_id, - returncode=first.returncode, - duration_s=dur, - ) - - if first.returncode != 0: - self.on_event(event="task_aborted", task_id=task_id, reason="initial-turn-failed") - return result - if first.session_id is None and turns > 1: - self.on_event(event="task_aborted", task_id=task_id, reason="no-session-id") - return result - - # Follow-up turns - last_response = first.response_text - sid = first.session_id - for turn in range(2, turns + 1): - next_prompt = self.followup.next( - original_task=prompt, last_response=last_response, turn=turn - ) - self.set_capture_context(task_id=task_id, turn=turn) - t0 = time.time() - try: - fu = self.driver.resume(next_prompt, session_id=sid, timeout=timeout) - except NotImplementedError: - self.on_event( - event="task_aborted", - task_id=task_id, - reason="resume-not-supported", - ) - break - except subprocess.TimeoutExpired: - dur = time.time() - t0 - self.on_event( - event="task_aborted", - task_id=task_id, - reason="follow-up-turn-timeout", - turn=turn, - duration_s=dur, - ) - break - dur = time.time() - t0 - result.turns.append( - TaskTurnResult( - turn=turn, - prompt=next_prompt, - session_id=sid, - returncode=fu.returncode, - response_text=fu.response_text, - duration_s=dur, - ) - ) - self._log_turn(task_id, turn, fu) - self.on_event( - event="turn_done", - task_id=task_id, - turn=turn, - session_id=sid, - returncode=fu.returncode, - duration_s=dur, - ) - if fu.returncode != 0: - break - last_response = fu.response_text - - return result - - def run_corpus( - self, - tasks: Sequence[str] | Iterable[str], - *, - turns_per_task: int, - timeout: float | None = None, - task_id_format: str = "task_{i:02d}", - ) -> list[TaskResult]: - results: list[TaskResult] = [] - for i, prompt in enumerate(tasks, start=1): - tid = task_id_format.format(i=i) - results.append( - self.run_task( - prompt, task_id=tid, turns=turns_per_task, timeout=timeout - ) - ) - return results diff --git a/src/agentcap/provider.py b/src/agentcap/provider.py deleted file mode 100644 index afdcd5b..0000000 --- a/src/agentcap/provider.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Identify the inference backend behind an upstream URL. - -Hostname classification (:func:`_hostname_fallback`) + -HF Router sub-provider pin (:func:`refine_for_sub_provider`). -:func:`probe` is the richer (network) variant — issues parallel -GETs to well-known introspection endpoints, never raises. -""" - -from __future__ import annotations - -import concurrent.futures -import ipaddress -import time -from typing import Any -from urllib.parse import urlparse - -import httpx - - -# Reverse proxies / custom domains won't match; the probe path catches those. -_HOSTNAME_TO_PROVIDER: dict[str, str] = { - "router.huggingface.co": "hf-router", - "api.openai.com": "openai", - "api.together.xyz": "together", - "api.anthropic.com": "anthropic", - "api.cerebras.ai": "cerebras", - "api.fireworks.ai": "fireworks", - "api.groq.com": "groq", -} - - -def _base_root(upstream_url: str) -> str: - # Introspection endpoints (/props, /info, ...) live under the - # server root, not /v1. - base = upstream_url.rstrip("/") - if base.endswith("/v1"): - base = base[:-3] - return base - - -def _hostname_fallback(upstream_url: str) -> str: - host = (urlparse(upstream_url).hostname or "").lower() - if not host: - return "unknown" - if host in _HOSTNAME_TO_PROVIDER: - return _HOSTNAME_TO_PROVIDER[host] - if host in ("localhost", "::1"): - return "local" - try: - ip = ipaddress.ip_address(host) - return "local" if (ip.is_loopback or ip.is_private) else host - except ValueError: - pass - parts = host.split(".") - return parts[-2] if len(parts) >= 2 else host - - -def _try_get(url: str, headers: dict, timeout: float) -> dict | None: - try: - r = httpx.get(url, headers=headers, timeout=timeout) - except (httpx.HTTPError, OSError): - return None - if r.status_code != 200: - return None - ct = r.headers.get("content-type", "") - out: dict[str, Any] = {"headers": {k.lower(): v for k, v in r.headers.items()}} - try: - if "json" in ct: - out["body"] = r.json() - else: - out["text"] = r.text[:4096] - except Exception: - return None - return out - - -def probe( - upstream_url: str, - *, - api_key: str | None = None, - timeout: float = 3.0, -) -> dict: - """Probe an OpenAI-compat upstream. Never raises.""" - root = _base_root(upstream_url) - headers = {"Authorization": f"Bearer {api_key}"} if api_key else {} - targets = { - "props": f"{root}/props", # llama.cpp - "info": f"{root}/info", # TGI - "version": f"{root}/version", # vLLM - "models": f"{root}/v1/models", - "metrics": f"{root}/metrics", - } - endpoints: dict[str, dict] = {} - with concurrent.futures.ThreadPoolExecutor(max_workers=len(targets)) as pool: - futures = { - name: pool.submit(_try_get, url, headers, timeout) - for name, url in targets.items() - } - for name, fut in futures.items(): - try: - res = fut.result(timeout=timeout + 1.0) - except concurrent.futures.TimeoutError: - res = None - if res is not None: - endpoints[name] = res - - return { - "upstream_url": upstream_url, - "provider": _classify(endpoints, upstream_url), - "probed_at": int(time.time()), - "endpoints": endpoints, - } - - -def _classify(endpoints: dict, upstream_url: str) -> str: - models_body = (endpoints.get("models") or {}).get("body") or {} - model_ids = [m.get("id", "") for m in (models_body.get("data") or [])] - - # HF Router model ids carry a ``:`` suffix. - if any(":" in i for i in model_ids): - return "hf-router" - if endpoints.get("props") is not None: - return "local-llama-server" - - info_body = (endpoints.get("info") or {}).get("body") or {} - if isinstance(info_body, dict) and info_body.get("model_id"): - return "tgi" - - version_body = (endpoints.get("version") or {}).get("body") or {} - if isinstance(version_body, dict) and version_body.get("version"): - return "vllm" - - if any(i.startswith(("gpt-", "o1-", "o3-", "o4-")) for i in model_ids): - return "openai" - - return _hostname_fallback(upstream_url) - - -def refine_for_sub_provider(provider: str, model: str | None) -> str: - """Surface HF Router's ``meta-llama/...:fireworks-ai`` pin as - ``hf-router/fireworks-ai`` in the provider slug.""" - if provider == "hf-router" and model and ":" in model: - return f"hf-router/{model.split(':', 1)[1]}" - return provider diff --git a/src/agentcap/proxy.py b/src/agentcap/proxy.py deleted file mode 100644 index 60df7e7..0000000 --- a/src/agentcap/proxy.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Capture proxy for OpenAI-compat chat completions. - -Captures ``POST /v1/chat/completions`` to -``/.{request,response}.json``; other paths -pass through. Streaming responses are forwarded chunk-by-chunk and -the assembled bytes persisted at end-of-stream. -""" - -from __future__ import annotations - -import json -import time -import uuid -from pathlib import Path -from typing import Any, AsyncIterator, Optional - -import httpx -from starlette.applications import Starlette -from starlette.requests import Request -from starlette.responses import Response, StreamingResponse -from starlette.routing import Route - - -# Constant so per-agent Containerfiles can bake the proxy URL into -# the agent's config files without per-run rewriting. -IN_PROCESS_PROXY_HOST = "127.0.0.1" -IN_PROCESS_PROXY_PORT = 0 # kernel-assigned ephemeral; read back via ProxyHandle.port - -CHAT_COMPLETIONS_PATH = "/v1/chat/completions" - -# Hop-by-hop (RFC 7230 §6.1) plus content-length / content-encoding -# which the framework recomputes from the re-emitted body. -_HOP_BY_HOP = frozenset({ - "host", - "content-length", - "content-encoding", - "transfer-encoding", - "connection", - "keep-alive", - "proxy-authenticate", - "proxy-authorization", - "te", - "trailers", - "upgrade", -}) - - -def _filter_headers(headers: Any) -> dict[str, str]: - return {k: v for k, v in headers.items() if k.lower() not in _HOP_BY_HOP} - - -def _safe_json_loads(raw: bytes) -> Any: - """Parse JSON; on failure, return a {"raw": } placeholder so - the capture stays well-formed even on malformed input.""" - try: - return json.loads(raw) - except (json.JSONDecodeError, ValueError): - return {"_unparsed_raw": raw.decode("utf-8", errors="replace")} - - -def _lower_headers(headers: Any) -> dict[str, str]: - try: - return {k.lower(): v for k, v in headers.items()} - except AttributeError: - return {} - - -def _extract_model_from_sse(raw: bytes) -> str | None: - """Find a ``"model"`` field in the first parseable SSE data line.""" - for line in raw.splitlines(): - if not line.startswith(b"data:"): - continue - payload = line[len(b"data:"):].strip() - if not payload or payload == b"[DONE]": - continue - try: - obj = json.loads(payload) - except (json.JSONDecodeError, ValueError): - continue - if isinstance(obj, dict): - m = obj.get("model") - if isinstance(m, str) and m: - return m - return None - - -def _response_fingerprint(headers: Any, body_obj: Any) -> dict[str, str | None]: - h = _lower_headers(headers) - served_model: str | None = None - if isinstance(body_obj, dict): - m = body_obj.get("model") - if isinstance(m, str) and m: - served_model = m - return { - "server": h.get("server") or None, - "x_served_by": h.get("x-served-by") or None, - "via": h.get("via") or None, - "build_info": h.get("x-build-info") or None, - "served_model": served_model, - } - - -class CaptureProxy: - """Capture proxy as a Starlette handler bundle. - - Pass a custom ``client`` (typically ``httpx.AsyncClient`` with - ``ASGITransport``) to wire against a mock upstream in tests. - """ - - def __init__( - self, - upstream: str, - capture_dir: Path | str, - *, - client: Optional[httpx.AsyncClient] = None, - ) -> None: - self.upstream = upstream.rstrip("/") - self.capture_dir = Path(capture_dir) - self.capture_dir.mkdir(parents=True, exist_ok=True) - self._client = client - self._owns_client = client is None - # Context the orchestrator sets before each turn — stamped into - # each captured request so rid → (task_id, turn) is recoverable - # from the capture file alone, no sidecar mapping. - self._task_id: str | None = None - self._turn: int | None = None - - def set_context(self, *, task_id: str | None, turn: int | None) -> None: - self._task_id = task_id - self._turn = turn - - async def _get_client(self) -> httpx.AsyncClient: - if self._client is None: - # No timeout: agent calls can be long, agent decides when to give up. - self._client = httpx.AsyncClient(timeout=None) - return self._client - - async def aclose(self) -> None: - if self._client is not None and self._owns_client: - await self._client.aclose() - - def _persist_request(self, request_id: str, body_bytes: bytes, captured_at: int) -> None: - path = self.capture_dir / f"{request_id}.request.json" - record = { - "request_id": request_id, - "captured_at": captured_at, - "upstream_url": self.upstream, - "task_id": self._task_id, - "turn": self._turn, - "body": _safe_json_loads(body_bytes), - } - path.write_text(json.dumps(record, indent=2)) - - def _persist_response_nonstream( - self, - request_id: str, - status_code: int, - body_bytes: bytes, - captured_at: int, - upstream_headers: Any, - ) -> None: - body = _safe_json_loads(body_bytes) - fp = _response_fingerprint(upstream_headers, body) - path = self.capture_dir / f"{request_id}.response.json" - record = { - "request_id": request_id, - "captured_at_resp": captured_at, - "stream": False, - "status_code": status_code, - "body": body, - "upstream_fingerprint": fp, - } - path.write_text(json.dumps(record, indent=2)) - - def _persist_response_stream( - self, - request_id: str, - status_code: int, - raw_bytes: bytes, - captured_at: int, - upstream_headers: Any, - ) -> None: - sse_model = _extract_model_from_sse(raw_bytes) - synthetic_body = {"model": sse_model} if sse_model else None - fp = _response_fingerprint(upstream_headers, synthetic_body) - path = self.capture_dir / f"{request_id}.response.json" - record = { - "request_id": request_id, - "captured_at_resp": captured_at, - "stream": True, - "status_code": status_code, - "raw": raw_bytes.decode("utf-8", errors="replace"), - "upstream_fingerprint": fp, - } - path.write_text(json.dumps(record, indent=2)) - - async def chat_completions(self, request: Request) -> Response: - body_bytes = await request.body() - body_obj = _safe_json_loads(body_bytes) - is_stream = bool(isinstance(body_obj, dict) and body_obj.get("stream", False)) - - request_id = uuid.uuid4().hex - self._persist_request(request_id, body_bytes, int(time.time())) - - url = f"{self.upstream}{CHAT_COMPLETIONS_PATH}" - fwd_headers = _filter_headers(request.headers) - client = await self._get_client() - - if is_stream: - return await self._forward_stream( - client, url, body_bytes, fwd_headers, request_id - ) - return await self._forward_nonstream( - client, url, body_bytes, fwd_headers, request_id - ) - - async def _forward_nonstream( - self, - client: httpx.AsyncClient, - url: str, - body_bytes: bytes, - fwd_headers: dict[str, str], - request_id: str, - ) -> Response: - upstream_resp = await client.post(url, content=body_bytes, headers=fwd_headers) - resp_bytes = upstream_resp.content - self._persist_response_nonstream( - request_id, - upstream_resp.status_code, - resp_bytes, - int(time.time()), - upstream_resp.headers, - ) - return Response( - content=resp_bytes, - status_code=upstream_resp.status_code, - headers=_filter_headers(upstream_resp.headers), - media_type=upstream_resp.headers.get("content-type"), - ) - - async def _forward_stream( - self, - client: httpx.AsyncClient, - url: str, - body_bytes: bytes, - fwd_headers: dict[str, str], - request_id: str, - ) -> StreamingResponse: - # We need the upstream status + content-type before we can - # construct the StreamingResponse. Open the stream eagerly, - # capture metadata, then yield bytes lazily. - async def streamer() -> AsyncIterator[bytes]: - chunks: list[bytes] = [] - status_code = 502 - upstream_headers: Any = {} - try: - async with client.stream( - "POST", url, content=body_bytes, headers=fwd_headers - ) as upstream_resp: - status_code = upstream_resp.status_code - upstream_headers = upstream_resp.headers - async for chunk in upstream_resp.aiter_bytes(): - chunks.append(chunk) - yield chunk - finally: - self._persist_response_stream( - request_id, - status_code, - b"".join(chunks), - int(time.time()), - upstream_headers, - ) - - return StreamingResponse(streamer(), media_type="text/event-stream") - - async def passthrough(self, request: Request) -> Response: - url = f"{self.upstream}{request.url.path}" - if request.url.query: - url = f"{url}?{request.url.query}" - body_bytes = await request.body() - fwd_headers = _filter_headers(request.headers) - client = await self._get_client() - upstream_resp = await client.request( - request.method, - url, - content=body_bytes if body_bytes else None, - headers=fwd_headers, - ) - return Response( - content=upstream_resp.content, - status_code=upstream_resp.status_code, - headers=_filter_headers(upstream_resp.headers), - media_type=upstream_resp.headers.get("content-type"), - ) - - -def make_app( - upstream: str, - capture_dir: Path | str, - *, - client: Optional[httpx.AsyncClient] = None, -) -> Starlette: - """Build the Starlette ASGI app wrapping a CaptureProxy.""" - proxy = CaptureProxy(upstream, capture_dir, client=client) - routes = [ - Route(CHAT_COMPLETIONS_PATH, proxy.chat_completions, methods=["POST"]), - Route( - "/{full_path:path}", - proxy.passthrough, - methods=["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"], - ), - ] - - from contextlib import asynccontextmanager - - @asynccontextmanager - async def lifespan(app: Starlette): - try: - yield - finally: - await proxy.aclose() - - app = Starlette(routes=routes, lifespan=lifespan) - app.state.proxy = proxy - return app - - -def serve( - upstream: str, - capture_dir: Path | str, - host: str = "127.0.0.1", - port: int = 8001, -) -> None: - import uvicorn - - app = make_app(upstream, capture_dir) - uvicorn.run(app, host=host, port=port) - - -class ProxyHandle: - """Running in-process proxy. Use as a context manager.""" - - def __init__( - self, server, thread, host: str, port: int, - proxy: CaptureProxy, - ) -> None: - self._server = server - self._thread = thread - self.host = host - self.port = port - self.proxy = proxy - - def set_context(self, *, task_id: str | None, turn: int | None) -> None: - """Forward to the underlying ``CaptureProxy`` so subsequent - captures are stamped with the given orchestrator-turn context.""" - self.proxy.set_context(task_id=task_id, turn=turn) - - @property - def base_url(self) -> str: - return f"http://{self.host}:{self.port}" - - def shutdown(self, *, timeout: float = 10) -> None: - self._server.should_exit = True - self._thread.join(timeout=timeout) - - def __enter__(self) -> "ProxyHandle": - return self - - def __exit__(self, *exc) -> None: - self.shutdown() - - -def serve_in_thread( - upstream: str, - capture_dir: Path | str, - host: str = IN_PROCESS_PROXY_HOST, - port: int = IN_PROCESS_PROXY_PORT, - *, - log_level: str = "warning", - startup_timeout: float = 10.0, -) -> ProxyHandle: - """Start the proxy on a daemon thread; block until uvicorn is bound. - - With ``port=0`` the kernel-assigned port is read back into - ``ProxyHandle.port``. - """ - import threading - import time - - import uvicorn - - app = make_app(upstream, capture_dir) - config = uvicorn.Config(app, host=host, port=port, log_level=log_level) - server = uvicorn.Server(config) - - thread = threading.Thread(target=server.run, daemon=True) - thread.start() - - deadline = time.time() + startup_timeout - while not server.started: - if time.time() > deadline: - server.should_exit = True - thread.join(timeout=2) - raise RuntimeError( - f"proxy did not start within {startup_timeout}s on {host}:{port}" - ) - time.sleep(0.05) - - bound_host, bound_port = host, port - try: - bound_host, bound_port = server.servers[0].sockets[0].getsockname()[:2] - except (AttributeError, IndexError, TypeError): - pass - - return ProxyHandle(server, thread, bound_host, bound_port, proxy=app.state.proxy) diff --git a/src/agentcap/sandbox/__init__.py b/src/agentcap/sandbox/__init__.py deleted file mode 100644 index 80a400a..0000000 --- a/src/agentcap/sandbox/__init__.py +++ /dev/null @@ -1,116 +0,0 @@ -"""Filesystem / network sandbox for capture-run subprocesses. - -Single implementation: each ``run()`` is an ephemeral -``podman run --rm`` against the per-agent image built from -``containers/agentcap-.Containerfile``. The agent CLI lives -inside the image, never on the host. -""" - -from __future__ import annotations - -import platform -import shutil -import subprocess -import sys -from pathlib import Path -from typing import Protocol, runtime_checkable - - -@runtime_checkable -class Sandbox(Protocol): - """Paths returned by :meth:`mkdtemp` and consumed by - :meth:`write_text` / :meth:`read_text` are host paths bind-mounted - into the agent's view at the same path.""" - - name: str - - def wrap( - self, - argv: list[str], - *, - writable_paths: list[Path], - deny_network: bool = False, - ) -> list[str]: - ... - - def run( - self, - argv: list[str], - *, - env: dict[str, str] | None = None, - cwd: str | None = None, - writable_paths: list[Path] | None = None, - deny_network: bool = False, - timeout: float | None = None, - check: bool = False, - ) -> subprocess.CompletedProcess: - ... - - def mkdtemp(self, prefix: str = "agentcap-") -> str: ... - def rmtree(self, path: str) -> None: ... - def write_text(self, path: str, content: str) -> None: ... - def read_text(self, path: str) -> str: ... - - -def get_sandbox( - *, - agent: str, - env: dict[str, str] | None = None, - readonly_paths: list[Path] | None = None, - writable_paths: list[Path] | None = None, -) -> Sandbox: - """Return a sandbox handle for ``agent``. Pure: does not build - the image. Call :func:`require_sandbox_or_die` to provision.""" - from .podman import PodmanSandbox - from .podman_provisioning import image_tag - return PodmanSandbox( - image=image_tag(agent), env=env, - readonly_paths=readonly_paths, - writable_paths=writable_paths, - ) - - -def require_sandbox_or_die( - *, - agent: str, - command: str = "agentcap run", - log=lambda msg: None, - env: dict[str, str] | None = None, - readonly_paths: list[Path] | None = None, - writable_paths: list[Path] | None = None, -) -> "Sandbox": - """Return a sandbox handle, or exit 2 with an install hint. - Triggers an image build on first use.""" - system = platform.system() - if system not in ("Linux", "Darwin"): - sys.stderr.write( - f"{command}: agentcap sandboxing is only supported on " - f"Linux and macOS; host is {system!r}.\n" - ) - sys.exit(2) - if not shutil.which("podman"): - sys.stderr.write( - f"{command}: podman is required.\n" - " Install with: brew install podman (macOS) " - "or apt install podman (Linux)\n" - ) - sys.exit(2) - from .podman_provisioning import ensure_image, ensure_machine_running - try: - ensure_machine_running(log=log) - ensure_image(agent, log=log) - except (FileNotFoundError, RuntimeError) as exc: - sys.stderr.write(f"{command}: {exc}\n") - sys.exit(2) - return get_sandbox( - agent=agent, env=env, - readonly_paths=readonly_paths, - writable_paths=writable_paths, - ) - - -__all__ = [ - "Sandbox", - "get_sandbox", - "require_sandbox_or_die", -] diff --git a/src/agentcap/sandbox/podman.py b/src/agentcap/sandbox/podman.py deleted file mode 100644 index 3072412..0000000 --- a/src/agentcap/sandbox/podman.py +++ /dev/null @@ -1,189 +0,0 @@ -"""Podman container sandbox. - -Each ``run()`` is a fresh ``podman run --rm`` against a pre-built -image. Host paths in ``writable_paths`` / ``readonly_paths`` are -bind-mounted into the container at the same path so the agent sees -identical paths inside and outside. - -The image is *not* built here — callers must ensure it exists in the -local podman image store before constructing the sandbox. -""" - -from __future__ import annotations - -import shutil -import subprocess -import tempfile -from pathlib import Path - - -_PODMAN = "podman" - - -def build_command( - argv: list[str], - *, - image: str, - writable_paths: list[Path], - readonly_paths: list[Path] | None = None, - deny_network: bool = False, - env: dict[str, str] | None = None, - cwd: str | None = None, -) -> list[str]: - """Assemble a ``podman run --rm ... `` invocation.""" - cmd = [_PODMAN, "run", "--rm"] - if deny_network: - cmd.append("--network=none") - if cwd is not None: - cmd.extend(["--workdir", str(cwd)]) - - bound: set[str] = set() - all_writable = list(writable_paths) - if cwd is not None: - all_writable.append(Path(cwd)) - for p in all_writable: - resolved = str(Path(p).resolve()) - if resolved in bound: - continue - bound.add(resolved) - cmd.extend(["--mount", f"type=bind,src={resolved},dst={resolved}"]) - for p in readonly_paths or []: - resolved = str(Path(p).resolve()) - if resolved in bound: - continue - bound.add(resolved) - cmd.extend(["--mount", f"type=bind,src={resolved},dst={resolved},ro"]) - - for k, v in (env or {}).items(): - cmd.extend(["-e", f"{k}={v}"]) - - cmd.append(image) - cmd.extend(argv) - return cmd - - -class PodmanSandbox: - """Image-based sandbox using ``podman run --rm``. - - The image holds the agent CLI + deps; nothing on the host is - visible inside the container except paths the driver explicitly - passes via ``writable_paths`` / ``readonly_paths``. - """ - - name = "podman" - - def __init__( - self, - image: str, - *, - env: dict[str, str] | None = None, - readonly_paths: list[Path] | None = None, - writable_paths: list[Path] | None = None, - ) -> None: - self.image = image - self._extra_env: dict[str, str] = dict(env or {}) - self._readonly_paths: list[Path] = list(readonly_paths or []) - self._writable_paths: list[Path] = list(writable_paths or []) - - def close(self) -> None: - """No-op. Each ``run()`` produces an ephemeral container.""" - - def __enter__(self) -> "PodmanSandbox": - return self - - def __exit__(self, *_exc) -> None: - self.close() - - def wrap( - self, - argv: list[str], - *, - writable_paths: list[Path], - deny_network: bool = False, - env: dict[str, str] | None = None, - cwd: str | None = None, - ) -> list[str]: - full_env = dict(self._extra_env) - if env: - full_env.update(env) - return build_command( - argv, - image=self.image, - writable_paths=list(writable_paths) + self._writable_paths, - readonly_paths=self._readonly_paths, - deny_network=deny_network, - env=full_env, - cwd=cwd, - ) - - def run( - self, - argv: list[str], - *, - env: dict[str, str] | None = None, - cwd: str | None = None, - writable_paths: list[Path] | None = None, - deny_network: bool = False, - timeout: float | None = None, - check: bool = False, - ) -> subprocess.CompletedProcess: - wrapped = self.wrap( - argv, - writable_paths=writable_paths or [], - deny_network=deny_network, - env=env, - cwd=cwd, - ) - # ``--rm`` only fires on a clean container exit; if the orchestrator - # is killed, times out, or the parent process dies before the - # container does, the container is orphaned and its overlay layer - # accumulates in the podman VM. Tag every invocation with a unique - # ``--name`` so a ``finally`` can force-remove it no matter how - # ``subprocess.run`` returned. - import uuid - name = f"agentcap-{uuid.uuid4().hex[:12]}" - wrapped.insert(2, "--name") - wrapped.insert(3, name) - try: - return subprocess.run( - wrapped, - stdin=subprocess.DEVNULL, - capture_output=True, text=True, - timeout=timeout, check=check, - ) - finally: - # Cleanup is best-effort: if it raises (timeout, podman - # missing, etc.) we must NOT shadow the primary outcome of - # ``run()`` — turning a successful container exit into a - # cleanup failure (or hiding the real subprocess error - # behind a generic rm failure) would surprise every caller. - try: - subprocess.run( - [_PODMAN, "rm", "-f", name], - stdin=subprocess.DEVNULL, - capture_output=True, text=True, - timeout=30, - ) - except Exception: # noqa: BLE001 - pass - - @staticmethod - def _runs_dir() -> Path: - d = Path.home() / ".cache" / "agentcap" / "runs" - d.mkdir(parents=True, exist_ok=True) - return d - - def mkdtemp(self, prefix: str = "agentcap-") -> str: - return tempfile.mkdtemp(prefix=prefix, dir=str(self._runs_dir())) - - def rmtree(self, path: str) -> None: - shutil.rmtree(path, ignore_errors=True) - - def write_text(self, path: str, content: str) -> None: - Path(path).write_text(content) - - def read_text(self, path: str) -> str: - return Path(path).read_text() - - -__all__ = ["PodmanSandbox", "build_command"] diff --git a/src/agentcap/sandbox/podman_provisioning.py b/src/agentcap/sandbox/podman_provisioning.py deleted file mode 100644 index 7b9580e..0000000 --- a/src/agentcap/sandbox/podman_provisioning.py +++ /dev/null @@ -1,207 +0,0 @@ -"""Per-agent podman image lifecycle: ``ensure_image`` for -``agentcap run`` and the pytest fixture both. - -The Containerfile is the source of truth: its SHA256 is baked into -the built image as a label, and a hash mismatch on subsequent runs -forces a rebuild. -""" - -from __future__ import annotations - -import hashlib -import json -import platform -import shutil -import subprocess -from pathlib import Path - -_CONTAINERFILE_DIR = ( - Path(__file__).resolve().parents[3] / "containers" -) - -_HASH_LABEL = "agentcap.containerfile-hash" - - -def containerfile_path(agent: str) -> Path: - return _CONTAINERFILE_DIR / f"agentcap-{agent}.Containerfile" - - -def image_tag(agent: str) -> str: - return f"localhost/agentcap-{agent}:latest" - - -def _containerfile_hash(path: Path) -> str: - h = hashlib.sha256() - h.update(path.read_bytes()) - name = path.stem - ctx = path.parent / name - if ctx.is_dir(): - for f in sorted(ctx.rglob("*")): - if f.is_file(): - h.update(str(f.relative_to(ctx)).encode()) - h.update(b"\0") - h.update(f.read_bytes()) - h.update(b"\0") - return h.hexdigest() - - -def _image_info(tag: str) -> dict | None: - if not shutil.which("podman"): - return None - r = subprocess.run( - ["podman", "image", "inspect", tag], - capture_output=True, text=True, - ) - if r.returncode != 0: - return None - try: - info = json.loads(r.stdout) - except json.JSONDecodeError: - return None - return info[0] if isinstance(info, list) and info else None - - -def _image_stored_hash(info: dict) -> str | None: - labels = (info.get("Labels") or info.get("Config", {}).get("Labels")) or {} - return labels.get(_HASH_LABEL) - - -def _image_is_current(tag: str, cf: Path) -> bool: - info = _image_info(tag) - if info is None: - return False - stored = _image_stored_hash(info) - return stored is not None and stored == _containerfile_hash(cf) - - -def ensure_image( - agent: str, - *, - log=lambda msg: None, -) -> str: - """Build the per-agent podman image from the Containerfile if - absent or stale; return the image tag. - - Raises ``FileNotFoundError`` if the Containerfile is missing, - ``RuntimeError`` if ``podman`` isn't installed or the build fails. - """ - if not shutil.which("podman"): - raise RuntimeError( - "podman not on $PATH (brew install podman / apt install podman)" - ) - cf = containerfile_path(agent) - if not cf.is_file(): - raise FileNotFoundError(f"Containerfile not found: {cf}") - tag = image_tag(agent) - - if _image_is_current(tag, cf): - log(f"{tag} ready (Containerfile hash match)") - return tag - - if _image_info(tag) is not None: - log(f"{tag} is stale; rebuilding…") - subprocess.run( - ["podman", "rmi", "--force", tag], - capture_output=True, text=True, check=False, - ) - else: - log(f"{tag} not built; building (cold build can take minutes)…") - - cf_hash = _containerfile_hash(cf) - r = subprocess.run( - [ - "podman", "build", - "-f", str(cf), - "-t", tag, - "--label", f"{_HASH_LABEL}={cf_hash}", - str(cf.parent), - ], - timeout=1800, - ) - if r.returncode != 0: - raise RuntimeError( - f"podman build failed for {tag} (rc={r.returncode}); " - f"see streamed output above." - ) - log(f"{tag} built") - return tag - - -def rmi_image(tag: str) -> None: - subprocess.run( - ["podman", "rmi", "--force", tag], - capture_output=True, text=True, timeout=60, check=False, - ) - - -def _machine_status() -> str | None: - """Return the status (``Running`` / ``Stopped`` / ``Starting`` / - ...) of the default podman machine, or ``None`` if no machine - exists.""" - if not shutil.which("podman"): - return None - r = subprocess.run( - ["podman", "machine", "list", "--format", "json"], - capture_output=True, text=True, - ) - if r.returncode != 0: - return None - try: - machines = json.loads(r.stdout) - except json.JSONDecodeError: - return None - if not machines: - return None - default = next( - (m for m in machines if m.get("Default")), machines[0], - ) - if default.get("Running"): - return "Running" - if default.get("Starting"): - return "Starting" - return "Stopped" - - -def ensure_machine_running(*, log=lambda msg: None) -> None: - """macOS only: ensure ``podman machine`` is up. No-op on Linux, - where podman talks to the host kernel directly. - - Never auto-initialises the machine — that's a 1-2 GB download - and a multi-minute operation the user should consent to. Raises - ``RuntimeError`` if podman isn't installed, no machine exists, - or the machine can't be started. - """ - if platform.system() != "Darwin": - return - if not shutil.which("podman"): - raise RuntimeError( - "podman not on $PATH (brew install podman)" - ) - status = _machine_status() - if status is None: - raise RuntimeError( - "no podman machine found. Initialise one first:\n" - " podman machine init\n" - " podman machine start" - ) - if status == "Running": - return - log(f"podman machine is {status}; starting…") - r = subprocess.run( - ["podman", "machine", "start"], - capture_output=True, text=True, timeout=300, - ) - if r.returncode != 0: - raise RuntimeError( - f"podman machine start failed (rc={r.returncode}): " - f"{r.stderr.strip()}" - ) - - -__all__ = [ - "containerfile_path", - "ensure_image", - "ensure_machine_running", - "image_tag", - "rmi_image", -] diff --git a/src/agentcap/scan.py b/src/agentcap/scan.py deleted file mode 100644 index dc4e86d..0000000 --- a/src/agentcap/scan.py +++ /dev/null @@ -1,247 +0,0 @@ -"""Secret scan over a capture run, gating ``agentcap export``. - -Shells out to `trufflehog filesystem` and parses its JSON output. -Captures and traces are scanned as plain text (JSON / JSONL); the -parquet repackaging happens after the scan, so we always check the -unpacked source. - -Policy: a single ``verified`` hit aborts the export. ``unverified`` -hits are reported but do not block — TruffleHog's pattern matchers -have a real false-positive rate (e.g. a 32-char alphanumeric in a -model response looks like a Box OAuth token), and we don't have -verification credentials for most providers. - -Scan results are persisted to ``/scan.json`` so subsequent -``agentcap export`` invocations skip the (sometimes slow) verify -step. The cache is invalidated when the user passes ``--rescan`` or -when the recorded ``no_verification`` mode doesn't match the -requested mode (an unverified cache can't satisfy a verified -request). -""" - -from __future__ import annotations - -import json -import os -import shutil -import subprocess -import time -from dataclasses import asdict, dataclass, field -from pathlib import Path - - -@dataclass -class ScanHit: - detector: str - file: str - verified: bool - raw: str # redacted-by-Trufflehog "Raw" field, kept for context - - -@dataclass -class ScanResult: - bytes_scanned: int = 0 - chunks_scanned: int = 0 - verified: list[ScanHit] = field(default_factory=list) - unverified: list[ScanHit] = field(default_factory=list) - - -class TrufflehogMissingError(RuntimeError): - """``trufflehog`` is not on PATH (and not in ~/.local/bin).""" - - -_INSTALL_HINT = ( - "trufflehog is required for the pre-export secret scan but was not " - "found on PATH. Install with:\n" - " curl -sSfL https://raw.githubusercontent.com/trufflesecurity/" - "trufflehog/main/scripts/install.sh | sh -s -- -b ~/.local/bin\n" - "Or pass --no-scan to ``agentcap export`` to skip the scan." -) - - -def find_trufflehog() -> str: - """Locate the ``trufflehog`` binary. Checks PATH then - ``~/.local/bin`` (the installer's default target). - Raises :class:`TrufflehogMissingError` if not found.""" - on_path = shutil.which("trufflehog") - if on_path: - return on_path - local = Path.home() / ".local" / "bin" / "trufflehog" - if local.is_file() and os.access(local, os.X_OK): - return str(local) - raise TrufflehogMissingError(_INSTALL_HINT) - - -def scan_path( - path: Path | str, - *, - no_verification: bool = False, - extra_args: tuple[str, ...] = (), -) -> ScanResult: - """Scan ``path`` (a directory or file) with trufflehog. - - ``no_verification=False`` (the default) round-trips every - candidate against the provider's API (Stripe, AWS, GitHub, HF, …) - so the ``verified`` bucket is high-precision. Requires network. - Pass ``True`` for offline pattern-only matching — faster but - everything lands as ``unverified``. - """ - bin_path = find_trufflehog() - argv = [ - bin_path, "filesystem", str(path), - "--json", "--no-color", - "--results=verified,unverified", - ] - if no_verification: - argv.append("--no-verification") - argv.extend(extra_args) - - proc = subprocess.run( - argv, capture_output=True, text=True, check=False, - ) - - result = ScanResult() - for line in proc.stdout.splitlines(): - if not line.strip(): - continue - try: - rec = json.loads(line) - except json.JSONDecodeError: - continue - if "DetectorName" not in rec: - continue - hit = ScanHit( - detector=rec.get("DetectorName") or "?", - file=( - rec.get("SourceMetadata", {}) - .get("Data", {}) - .get("Filesystem", {}) - .get("file") or "?" - ), - verified=bool(rec.get("Verified")), - raw=str(rec.get("Raw") or "")[:80], - ) - (result.verified if hit.verified else result.unverified).append(hit) - - # The summary line on stderr looks like: - # ... finished scanning {"chunks":..., "bytes":..., "verified_secrets":..., "unverified_secrets":...} - # Parse what we can; the per-hit list above is authoritative. - for line in proc.stderr.splitlines(): - if "finished scanning" not in line: - continue - brace = line.find("{") - if brace < 0: - continue - try: - stats = json.loads(line[brace:]) - except json.JSONDecodeError: - continue - result.bytes_scanned = int(stats.get("bytes", 0)) - result.chunks_scanned = int(stats.get("chunks", 0)) - break - - return result - - -SCAN_CACHE_NAME = "scan.json" - - -def _result_to_dict(result: ScanResult, *, no_verification: bool) -> dict: - return { - "scanned_at": int(time.time()), - "no_verification": no_verification, - "bytes_scanned": result.bytes_scanned, - "chunks_scanned": result.chunks_scanned, - "verified": [asdict(h) for h in result.verified], - "unverified": [asdict(h) for h in result.unverified], - } - - -def _result_from_dict(d: dict) -> ScanResult: - return ScanResult( - bytes_scanned=int(d.get("bytes_scanned") or 0), - chunks_scanned=int(d.get("chunks_scanned") or 0), - verified=[ScanHit(**h) for h in (d.get("verified") or [])], - unverified=[ScanHit(**h) for h in (d.get("unverified") or [])], - ) - - -def load_cached_scan( - run_dir: Path | str, *, no_verification: bool, -) -> ScanResult | None: - """Return a previously persisted scan if it covers the requested - verification mode. A cache produced with ``no_verification=True`` - cannot satisfy a ``no_verification=False`` request (the verified - bucket would be unsound), so we re-scan in that direction. - Returns ``None`` when no usable cache exists.""" - cache_path = Path(run_dir) / SCAN_CACHE_NAME - if not cache_path.is_file(): - return None - try: - d = json.loads(cache_path.read_text()) - except (OSError, json.JSONDecodeError): - return None - cached_no_verify = bool(d.get("no_verification", True)) - if cached_no_verify and not no_verification: - # Want verified results; cache only has patterns. - return None - return _result_from_dict(d) - - -_SCAN_SUBDIRS = ("captures", "traces", "sessions") - - -def scan_run_dir( - run_dir: Path | str, - *, - no_verification: bool = False, - rescan: bool = False, -) -> tuple[ScanResult, bool]: - """Scan a run dir, persisting the result to ``/scan.json`` - for cheap reuse. Returns ``(result, was_cached)``. - - Scans the three subdirs that can hold user/agent text — captures, - traces, and sessions — and skips top-level files like - ``run.json`` and the cache itself. ``rescan=True`` ignores any - persisted result and re-runs trufflehog. Otherwise the cache is - used when it covers the requested mode.""" - run_dir = Path(run_dir) - if not rescan: - cached = load_cached_scan(run_dir, no_verification=no_verification) - if cached is not None: - return cached, True - - merged = ScanResult() - for name in _SCAN_SUBDIRS: - sub = run_dir / name - if not sub.is_dir(): - continue - part = scan_path(sub, no_verification=no_verification) - merged.bytes_scanned += part.bytes_scanned - merged.chunks_scanned += part.chunks_scanned - merged.verified.extend(part.verified) - merged.unverified.extend(part.unverified) - - try: - (run_dir / SCAN_CACHE_NAME).write_text( - json.dumps( - _result_to_dict(merged, no_verification=no_verification), - indent=2, - ) - ) - except OSError: - # Cache write isn't load-bearing — let the scan result through. - pass - return merged, False - - -__all__ = [ - "SCAN_CACHE_NAME", - "ScanHit", - "ScanResult", - "TrufflehogMissingError", - "find_trufflehog", - "load_cached_scan", - "scan_path", - "scan_run_dir", -] diff --git a/src/lib.rs b/src/lib.rs index d8d0b8d..6238658 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ -//! agentcap — Rust port (data/UI half: export, push, scan, inspect). +//! agentcap — capture agent ↔ model interactions and publish them as HF datasets. //! -//! The capture/runtime half (`run`, proxy, sandbox, drivers) still lives in the -//! Python package under `src/agentcap/`; this crate reads the captures it writes. +//! `run` drives an agent through a corpus behind a capture proxy; `export` renders +//! the captures to parquet and pushes them to the Hub; `inspect` / `ls` browse them. pub mod captures; pub mod diff; diff --git a/src/provider.rs b/src/provider.rs index 19de462..29b3a90 100644 --- a/src/provider.rs +++ b/src/provider.rs @@ -1,9 +1,9 @@ //! Identify the inference backend behind an upstream URL. //! -//! This is the pure subset the export path needs: hostname classification -//! ([`hostname_fallback`]) + the HF Router sub-provider pin -//! ([`refine_for_sub_provider`]). The network `probe` (live introspection) is -//! part of the capture/runtime half and lives in the Python package for now. +//! Hostname classification ([`hostname_fallback`]) + the HF Router sub-provider pin +//! ([`refine_for_sub_provider`]) — what `run` and `export` use to slug a backend. +//! Live network introspection of the backend isn't implemented; the hostname slug +//! is enough for both paths. use std::net::IpAddr; diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 7b7940b..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,514 +0,0 @@ -"""Shared pytest fixtures. - -Live tests run when prereqs are present, skip otherwise. Prereqs: - - - Agent binary present in the per-agent sandbox - (``agentcap run --agent `` once provisions it). - - ``podman`` on PATH (the fixture pulls and runs the official - ``ghcr.io/ggml-org/llama.cpp`` server image). -""" - -from __future__ import annotations - -import http.server -import os -import shutil -import socket -import socketserver -import subprocess -import sys -import threading -import time -from pathlib import Path -from urllib.request import urlopen - -import pytest - - -pytest_plugins = ["tests.fixtures.sandbox_images"] - - -def _log(msg: str) -> None: - """Write a progress line to stderr (visible with ``pytest -s``).""" - sys.stderr.write(f" [agentcap-test] {msg}\n") - sys.stderr.flush() - - -# Default test target. ``hf_hub_download`` of Qwen3-1.7B Q8_0 is the -# "click and run" path — agentcap fetches the model bytes, user -# doesn't manage GGUF files. Qwen3-1.7B is the smallest checkpoint -# in this family that chains read → edit reliably across the four -# drivers; ~1.7 GB downloads + loads on a CI runner in a couple of -# minutes. Semantic correctness is intentionally not graded; the -# live tests verify the wire path, not the agent's task quality. -_DEFAULT_GGUF_REPO = "Qwen/Qwen3-1.7B-GGUF" -_DEFAULT_GGUF_FILE = "Qwen3-1.7B-Q8_0.gguf" -_DEFAULT_MODEL_ALIAS = "Qwen3-1.7B" - -# Official llama.cpp server image, version-pinned per llama.cpp -# commit. Override via ``AGENTCAP_TEST_LLAMA_IMAGE`` to test a -# different release. CPU-only; the GPU variants are tagged -# ``server-cuda13-*`` / ``server-vulkan-*``. -_DEFAULT_LLAMA_IMAGE = "ghcr.io/ggml-org/llama.cpp:server-b9487" - - -def _fetch_default_gguf() -> str | None: - """Pull the default GGUF from HF Hub. Cached in the HF default - cache dir; first call downloads ~5GB (tqdm progress on stderr), - subsequent calls return the cached path instantly. Returns None - on any failure — caller treats that as 'skip live tests'.""" - try: - from huggingface_hub import hf_hub_download - except ImportError: - return None - _log( - f"fetching default GGUF " - f"{_DEFAULT_GGUF_REPO}/{_DEFAULT_GGUF_FILE} " - f"(cached in ~/.cache/huggingface/ after first download)…" - ) - try: - return hf_hub_download( - repo_id=_DEFAULT_GGUF_REPO, - filename=_DEFAULT_GGUF_FILE, - ) - except Exception as exc: - _log(f"GGUF download failed: {exc}") - return None - - -def _free_port() -> int: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("127.0.0.1", 0)) - return s.getsockname()[1] - - -def _wait_ready( - url: str, timeout: float = 180.0, log=lambda msg: None, -) -> None: - """Poll a ``/v1/models`` endpoint until it responds 200 or we - blow ``timeout`` seconds. Tiny GGUFs load in seconds; the budget - is generous so we don't flake on a cold weight load. - - Emits a heartbeat every ~10s so the test runner shows progress - during a slow weight load instead of looking hung.""" - deadline = time.time() + timeout - start = time.time() - last_hb = start - while time.time() < deadline: - try: - with urlopen(url, timeout=2) as r: - if r.status == 200: - return - except Exception: - pass - now = time.time() - if now - last_hb >= 10: - log(f"waiting for llama serve… ({int(now - start)}s elapsed)") - last_hb = now - time.sleep(1) - raise RuntimeError(f"llama serve never became ready at {url}") - - -def _agent_reachable_host() -> str: - """Hostname the agent (inside the podman container) uses to reach - a host-side server. Podman exposes the host gateway as - ``host.containers.internal``.""" - return "host.containers.internal" - - -@pytest.fixture(scope="session") -def live_llama_url(): - """Host-side server root of the llama backend (no ``/v1`` suffix). - - For tests that spawn their own proxy on top and need a directly- - reachable upstream. Reuses an existing ``llama serve`` on - 8000/8080, or spawns one as a podman container. - """ - for probe_port in (8000, 8080): - try: - with urlopen( - f"http://127.0.0.1:{probe_port}/v1/models", timeout=1, - ) as r: - if r.status == 200: - _log(f"reusing existing llama serve on :{probe_port}") - yield f"http://127.0.0.1:{probe_port}" - return - except Exception: - pass - - if not shutil.which("podman"): - pytest.skip( - "podman not on PATH; install with brew install podman " - "(macOS) or apt install podman (Linux)." - ) - # macOS: bring the podman machine up before any ``podman run`` so - # a stopped/uninitialised machine surfaces as a clear skip with - # an install hint, not a generic ``podman run`` failure. - from agentcap.sandbox.podman_provisioning import ensure_machine_running - try: - ensure_machine_running(log=_log) - except RuntimeError as exc: - pytest.skip(str(exc)) - gguf = os.environ.get("AGENTCAP_TEST_GGUF") or _fetch_default_gguf() - if not gguf: - pytest.skip( - "couldn't obtain a GGUF; HF fetch failed and no " - "AGENTCAP_TEST_GGUF override set." - ) - # HF cache stores GGUFs as symlinks into ``blobs/``; the container - # needs the realpath's parent dir bound in. - real_gguf = Path(gguf).resolve() - gguf_dir = real_gguf.parent - gguf_name = real_gguf.name - - image = os.environ.get( - "AGENTCAP_TEST_LLAMA_IMAGE", _DEFAULT_LLAMA_IMAGE, - ) - port = _free_port() - ctx = os.environ.get("AGENTCAP_TEST_CTX_SIZE", "8192") - name = f"agentcap-llama-{os.getpid()}" - argv = [ - "podman", "run", "--rm", "-d", "--name", name, - "-p", f"127.0.0.1:{port}:8080", - "--mount", f"type=bind,src={gguf_dir},dst=/models,ro", - image, - "--model", f"/models/{gguf_name}", - "--host", "0.0.0.0", - "--port", "8080", - "--ctx-size", ctx, - "--reasoning-format", "none", - "--jinja", - ] - _log( - f"spawning llama container {name} on :{port} " - f"(image={image}, gguf={gguf_name}, ctx={ctx})" - ) - # 10 min covers a cold-cache image pull (~1 GB) on a slow CI - # runner plus the actual ``podman run -d`` setup. - r = subprocess.run(argv, capture_output=True, text=True, timeout=600) - if r.returncode != 0: - # ``podman run`` failing once the host has podman is a real - # problem (bad flags, pull failure, permissions), not a missing - # prereq. Fail loud so CI doesn't silently green over it. - pytest.fail(f"podman run failed: {r.stderr.strip()}") - try: - _wait_ready( - f"http://127.0.0.1:{port}/v1/models", - timeout=180, - log=_log, - ) - _log(f"llama container ready at :{port}") - yield f"http://127.0.0.1:{port}" - finally: - subprocess.run( - ["podman", "rm", "-f", name], - capture_output=True, text=True, timeout=30, - ) - - -@pytest.fixture(scope="session") -def live_proxy_base_url(live_llama_url): - """Agent-side ``/v1`` URL of the in-process capture proxy. - - For tests that exercise the agent ↔ proxy ↔ llama path from - outside. - """ - import tempfile - - from agentcap.proxy import serve_in_thread - proxy_port = _free_port() - capture_dir = tempfile.mkdtemp(prefix="agentcap-pytest-captures-") - agent_url = f"http://{_agent_reachable_host()}:{proxy_port}/v1" - _log( - f"starting in-process proxy on 0.0.0.0:{proxy_port} " - f"-> {live_llama_url} (agents reach it at {agent_url})" - ) - with serve_in_thread( - live_llama_url, capture_dir, - host="0.0.0.0", port=proxy_port, - ): - yield agent_url - - -@pytest.fixture(scope="session") -def live_model() -> str: - return os.environ.get("AGENTCAP_TEST_MODEL", _DEFAULT_MODEL_ALIAS) - - -@pytest.fixture(scope="session") -def sandbox_for( - agentcap_image_for, live_proxy_base_url, live_model, -): - """Factory: ``sandbox_for("hermes")`` returns a Sandbox keyed on - the given agent. The image fixture ensures the per-agent podman - image is built first. - - The sandbox env is seeded with ``AGENTCAP_PROXY_URL`` *and* - ``AGENTCAP_MODEL`` so the per-agent entrypoint can start — the - opencode init script bails out without ``AGENTCAP_MODEL``, which - is enough to make ``command -v opencode`` (used as a skip probe - by ``agent_proj_for``) exit non-zero and silently skip the test. - """ - from agentcap.sandbox import get_sandbox - - cache: dict[str, object] = {} - - def _get(agent: str): - if agent in cache: - return cache[agent] - agentcap_image_for(agent) - sb = get_sandbox( - agent=agent, - env={ - "AGENTCAP_PROXY_URL": live_proxy_base_url, - "AGENTCAP_MODEL": live_model, - }, - ) - cache[agent] = sb - return sb - - yield _get - for sb in cache.values(): - close = getattr(sb, "close", None) - if callable(close): - close() - - -@pytest.fixture -def agent_proj_for(sandbox_for): - """Factory: ``agent_proj_for("hermes")`` returns - ``(sandbox, proj_path)``. The sandbox is probed for the agent - binary (test skips if it's missing) and a fresh empty project - dir is minted to serve as ``cwd``. - - The dir is removed at the end of the test. - """ - created: list[tuple[object, str]] = [] - - def _build(agent: str) -> tuple[object, str]: - sb = sandbox_for(agent) - _log(f"probing {agent!r} binary in sandbox…") - r = sb.run( - ["sh", "-c", f"command -v {agent}"], check=False, timeout=10, - ) - if r.returncode != 0: - pytest.skip( - f"{agent!r} is not on the sandbox's PATH; build the " - f"agentcap-{agent} image before running live tests." - ) - proj = sb.mkdtemp(prefix=f"agentcap-{agent}-proj-") - _log(f"{agent} project: {proj}") - created.append((sb, proj)) - return sb, proj - - yield _build - for sb, proj in created: - sb.rmtree(proj) - - -@pytest.fixture -def fake_sandbox(): - """A pass-through Sandbox stub for driver/CLI unit tests that - don't actually exercise sandbox isolation. Lives only in tests; - no production code depends on it.""" - import os - import tempfile - - class _FakeSandbox: - name = "fake" - - def wrap(self, argv, *, writable_paths, deny_network=False): - return list(argv) - - def run( - self, argv, *, env=None, cwd=None, writable_paths=None, - deny_network=False, timeout=None, check=False, - ): - full_env = {**os.environ, **(env or {})} - return subprocess.run( - list(argv), env=full_env, cwd=cwd, - capture_output=True, text=True, - timeout=timeout, check=check, - ) - - def mkdtemp(self, prefix="agentcap-"): - return tempfile.mkdtemp(prefix=prefix) - - def rmtree(self, path): - shutil.rmtree(path, ignore_errors=True) - - def write_text(self, path, content): - Path(path).write_text(content) - - def read_text(self, path): - return Path(path).read_text() - - return _FakeSandbox() - - -# --------------------------------------------------------------------------- -# Fake huggingface_hub.HfApi for export tests -# --------------------------------------------------------------------------- - - -class _FakeHfApi: - """Captures HfApi calls so the export layer can be asserted on - without hitting the network. Records ``create_repo`` / - ``list_repo_files`` / ``create_commit`` for the two dataset repos - (``-captures`` + per-agent ``-traces``), and the Collections API - surface used by ``ensure_collection`` (``list_collections``, - ``create_collection``, ``add_collection_item``). - - Parquet payloads are read back so tests can assert row counts + - column sets + request_ids; bytes payloads (README.md, raw trace - files) and string-path payloads (raw trace files committed via - ``CommitOperationAdd(path_or_fileobj=str)``) are recorded as their - content.""" - - def __init__(self): - self.created_repos: list[dict] = [] - self.commits: list[dict] = [] - self.collections_created: list[dict] = [] - self.collection_items: list[dict] = [] - # Default to steady-state: README already in the repo, so - # parquet-focused tests don't see the first-push README op - # bleed into their assertions. Tests exercising first-push - # behaviour clear this. - self.existing_files: list[str] = ["README.md"] - - # Back-compat single-call accessor for older tests that only - # cared about one repo. - @property - def created_repo(self) -> dict | None: - return self.created_repos[0] if self.created_repos else None - - def create_repo(self, *, repo_id, repo_type, exist_ok, private=False): - self.created_repos.append({ - "repo_id": repo_id, "repo_type": repo_type, - "exist_ok": exist_ok, "private": private, - }) - - def list_repo_files(self, repo_id, repo_type): - return list(self.existing_files) - - def create_commit(self, *, repo_id, repo_type, operations, commit_message): - import pyarrow.parquet as pq - - op_list: list[dict] = [] - for op in operations: - entry: dict = {"path_in_repo": op.path_in_repo} - payload = op.path_or_fileobj - if isinstance(payload, (bytes, bytearray)): - entry["bytes"] = bytes(payload) - elif isinstance(payload, str) and op.path_in_repo.endswith(".parquet"): - table = pq.read_table(payload) - entry["n_rows"] = table.num_rows - entry["columns"] = list(table.column_names) - entry["request_ids"] = list(table.column("request_id").to_pylist()) - else: - # Raw file (trace JSONL/JSON). Read bytes so tests - # can introspect the committed payload. - from pathlib import Path as _Path - entry["bytes"] = _Path(payload).read_bytes() if isinstance(payload, str) else b"" - op_list.append(entry) - self.commits.append({ - "repo_id": repo_id, - "repo_type": repo_type, - "commit_message": commit_message, - "operations": op_list, - }) - - # --- Collections API --- - - def list_collections(self, *, owner=None, q=None, limit=20): - # Idempotent ensure_collection looks for an existing one by - # title; the fake starts empty and returns whatever was made. - for c in self.collections_created: - if owner and c.get("namespace") != owner: - continue - if q and q not in (c.get("title") or ""): - continue - yield _FakeCollection(c["slug"], c["title"]) - - def create_collection( - self, title, *, namespace=None, description=None, - private=False, exists_ok=False, **_, - ): - slug = f"{namespace}/{title}-deadbeef" if namespace else f"{title}-deadbeef" - record = { - "slug": slug, "title": title, "namespace": namespace, - "description": description, "private": private, - } - self.collections_created.append(record) - return _FakeCollection(slug, title) - - def add_collection_item( - self, *, collection_slug, item_id, item_type, - exists_ok=False, **_, - ): - self.collection_items.append({ - "collection_slug": collection_slug, - "item_id": item_id, - "item_type": item_type, - }) - - -class _FakeCollection: - __slots__ = ("slug", "title") - def __init__(self, slug: str, title: str) -> None: - self.slug = slug - self.title = title - - -@pytest.fixture -def fake_hf_api(monkeypatch): - fake = _FakeHfApi() - monkeypatch.setattr("huggingface_hub.HfApi", lambda *a, **kw: fake) - return fake - - -# --------------------------------------------------------------------------- -# Mock HTTP server fixture -# --------------------------------------------------------------------------- - -class _RecordingHandler(http.server.BaseHTTPRequestHandler): - """GET-only handler that records every requested path on a class - attribute. Reset per fixture invocation.""" - received_paths: list[str] = [] - - def do_GET(self): # noqa: N802 - type(self).received_paths.append(self.path) - self.send_response(200) - self.send_header("Content-Type", "application/json") - self.end_headers() - self.wfile.write(b'{"ok": true}') - - def log_message(self, *args, **kwargs): # silence the stderr noise - pass - - -@pytest.fixture -def mock_http_server(): - """Spin up a tiny in-process HTTP server on a free port for the - duration of one test. Bound to ``0.0.0.0`` so a podman container - can reach it via ``host.containers.internal``. - - Yields ``(port, received_paths)``: the port the server is - listening on, and a list (live, mutated by request handlers) - of every path the server has been hit on. Useful for asserting - a sandboxed subprocess actually made the call we expected. - """ - _RecordingHandler.received_paths = [] - # Pick a free port by binding to :0 first, then handing it off. - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - port = s.getsockname()[1] - httpd = socketserver.TCPServer(("0.0.0.0", port), _RecordingHandler) - thread = threading.Thread(target=httpd.serve_forever, daemon=True) - thread.start() - try: - yield port, _RecordingHandler.received_paths - finally: - httpd.shutdown() - httpd.server_close() - thread.join(timeout=5) diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/fixtures/sandbox_images.py b/tests/fixtures/sandbox_images.py deleted file mode 100644 index 78cf932..0000000 --- a/tests/fixtures/sandbox_images.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Per-agent sandbox image lifecycle: pytest fixture + CLI. - -Two callers for the same logic: - -* ``python tests/fixtures/sandbox_images.py`` — pre-build every - per-agent image as a CI setup step so the test runner doesn't pay - the cold-build cost. -* ``agentcap_image_for`` pytest fixture — same logic, on demand, - when a test requests it. - -Registered as a pytest plugin in ``tests/conftest.py`` via -``pytest_plugins``. -""" - -from __future__ import annotations - -import argparse -import fnmatch -import sys - -import pytest - -from agentcap.drivers import known_drivers -from agentcap.sandbox.podman_provisioning import ( - ensure_image, ensure_machine_running, -) - - -def _log(msg: str) -> None: - sys.stderr.write(f" [sandbox-images] {msg}\n") - sys.stderr.flush() - - -def build_one(agent: str) -> str: - ensure_machine_running(log=_log) - return ensure_image(agent, log=_log) - - -def build_many(agents: list[str]) -> dict[str, str | Exception]: - """Build each agent's image, capturing per-agent failures so CI - surfaces the full failure set in one go.""" - out: dict[str, str | Exception] = {} - for agent in agents: - try: - out[agent] = build_one(agent) - except (FileNotFoundError, RuntimeError) as exc: - out[agent] = exc - return out - - -@pytest.fixture(scope="session") -def agentcap_image_for(): - """Factory: ``agentcap_image_for("hermes")`` ensures the - per-agent podman image is built and current. Skips if podman - or its machine isn't available.""" - cache: dict[str, str] = {} - - def _ensure(agent: str) -> str: - if agent in cache: - return cache[agent] - try: - tag = build_one(agent) - except (FileNotFoundError, RuntimeError) as exc: - pytest.skip(str(exc)) - cache[agent] = tag - return tag - - return _ensure - - -# --------------------------------------------------------------------------- -# CLI -# --------------------------------------------------------------------------- - - -def main() -> int: - parser = argparse.ArgumentParser( - description=( - "Pre-build the per-agent sandbox images used by " - "`agentcap run` and the live driver tests." - ), - ) - parser.add_argument( - "--list", - action="store_true", - help="List available agent names and exit.", - ) - parser.add_argument( - "pattern", - nargs="?", - default="*", - help=( - "Glob pattern to filter agents (e.g. 'goose', 'pi', " - "'*'). Default: '*' (all)." - ), - ) - args = parser.parse_args() - - all_agents = sorted(known_drivers()) - - if args.list: - for name in all_agents: - print(name) - return 0 - - targets = [a for a in all_agents if fnmatch.fnmatch(a, args.pattern)] - if not targets: - print( - f"no agents match pattern {args.pattern!r}; " - f"available: {', '.join(all_agents)}", - file=sys.stderr, - ) - return 1 - - _log(f"building: {', '.join(targets)}") - results = build_many(targets) - - ok = {a: t for a, t in results.items() if not isinstance(t, Exception)} - failed = {a: e for a, e in results.items() if isinstance(e, Exception)} - - for agent, tag in ok.items(): - _log(f" OK {agent} -> {tag}") - for agent, exc in failed.items(): - _log(f" FAIL {agent}: {exc}") - - if failed: - _log(f"{len(failed)}/{len(targets)} failed") - return 1 - _log(f"all {len(targets)} images ready") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tests/live.rs b/tests/live.rs index f55ccb6..1622b49 100644 --- a/tests/live.rs +++ b/tests/live.rs @@ -171,11 +171,8 @@ fn live_goose() { // hermes and opencode are intentionally omitted — neither runs via `agentcap run` // on the tiny CI model: // - hermes: its base system prompt (~3.9k tokens) exceeds the budget on -// Qwen3-1.7B, so it bails before any model call. The Python suite never ran -// hermes through the CLI either — `test_hermes_live` drove the driver directly -// with prompt-shrinking flags (`ignore_rules`, `toolsets="file"`) that `run` -// doesn't expose. hermes stdout parsing is covered by unit tests. -// - opencode: 1.15.x doesn't pick up the baked `agent.minimal` from the image -// (matching the `@pytest.mark.skip` on `test_opencode_live`). +// Qwen3-1.7B, so it bails before any model call. hermes stdout parsing is +// covered by unit tests. +// - opencode: 1.15.x doesn't pick up the baked `agent.minimal` from the image. // pi (symlink/JSONL traces) + goose (dump-traces/SQLite) cover the full stack // across both trace-surfacing mechanisms. diff --git a/tests/test_captures.py b/tests/test_captures.py deleted file mode 100644 index 29ab09e..0000000 --- a/tests/test_captures.py +++ /dev/null @@ -1,127 +0,0 @@ -"""Unit tests for ``agentcap.captures``.""" - -from __future__ import annotations - -import json -from pathlib import Path - -import pytest - -from agentcap.captures import ( - load_request, - load_requests, - resolve_workspace_rid, -) - - -def _write_capture(d: Path, rid: str, body: dict) -> None: - (d / f"{rid}.request.json").write_text( - json.dumps({ - "request_id": rid, - "captured_at": 1, - "upstream_url": "http://localhost:8000", - "body": body, - }) - ) - - -def test_load_request_from_capture_dir(tmp_path: Path) -> None: - cap = tmp_path / "captures" - cap.mkdir() - body = {"model": "m", "messages": [{"role": "user", "content": "hi"}]} - _write_capture(cap, "abc", body) - - assert load_request(str(cap), "abc") == body - - -def test_load_requests_batch_from_capture_dir(tmp_path: Path) -> None: - cap = tmp_path / "captures" - cap.mkdir() - _write_capture(cap, "a", {"model": "m", "messages": []}) - _write_capture(cap, "b", {"model": "m", "messages": [{"role": "user"}]}) - - out = load_requests(str(cap), ["a", "b"]) - assert set(out) == {"a", "b"} - assert out["a"]["messages"] == [] - - -def test_load_request_missing_id_raises(tmp_path: Path) -> None: - cap = tmp_path / "captures" - cap.mkdir() - _write_capture(cap, "a", {"model": "m"}) - - with pytest.raises(KeyError): - load_request(str(cap), "ghost") - - -def test_load_request_from_parquet(tmp_path: Path) -> None: - """Round-trip a body through ``export_local`` and back via the loader.""" - from agentcap.export import export_local - - cap = tmp_path / "captures" - cap.mkdir() - body = { - "model": "m", - "messages": [{"role": "user", "content": "hello"}], - "tools": [], - } - _write_capture(cap, "rid", body) - # Pair with a minimal response file so export_local has both halves. - (cap / "rid.response.json").write_text(json.dumps({ - "request_id": "rid", "captured_at_resp": 2, - "status_code": 200, "body": {"choices": []}, - })) - - parquet = tmp_path / "out.parquet" - n = export_local(cap, parquet, progress=False) - assert n == 1 - - loaded = load_request(str(parquet), "rid") - assert loaded == body - - -def test_load_requests_bad_source(tmp_path: Path) -> None: - not_a_thing = tmp_path / "nope.txt" - not_a_thing.write_text("x") - with pytest.raises(ValueError): - load_requests(str(not_a_thing), ["a"]) - - -def test_resolve_workspace_rid_finds_run(tmp_path: Path) -> None: - ws = tmp_path / ".agentcap" - run = ws / "hermes-local-20260101-000000" - cap = run / "captures" - cap.mkdir(parents=True) - _write_capture(cap, "rid-target", {"model": "m"}) - - found = resolve_workspace_rid(ws, "rid-target") - assert found == (cap, "rid-target") - - -def test_resolve_workspace_rid_accepts_prefix(tmp_path: Path) -> None: - ws = tmp_path / ".agentcap" - cap = ws / "hermes-local-20260101-000000" / "captures" - cap.mkdir(parents=True) - _write_capture(cap, "abc12345deadbeef", {"model": "m"}) - - found = resolve_workspace_rid(ws, "abc12345") - assert found == (cap, "abc12345deadbeef") - - -def test_resolve_workspace_rid_ambiguous_prefix_raises(tmp_path: Path) -> None: - from agentcap.captures import AmbiguousRequestId - - ws = tmp_path / ".agentcap" - cap = ws / "hermes-local-20260101-000000" / "captures" - cap.mkdir(parents=True) - _write_capture(cap, "abc12345_a", {"model": "m"}) - _write_capture(cap, "abc12345_b", {"model": "m"}) - - with pytest.raises(AmbiguousRequestId): - resolve_workspace_rid(ws, "abc12345") - - -def test_resolve_workspace_rid_returns_none_when_absent(tmp_path: Path) -> None: - ws = tmp_path / ".agentcap" - ws.mkdir() - assert resolve_workspace_rid(ws, "ghost") is None diff --git a/tests/test_cli.py b/tests/test_cli.py deleted file mode 100644 index 0531e9e..0000000 --- a/tests/test_cli.py +++ /dev/null @@ -1,451 +0,0 @@ -"""CLI smoke tests for `agentcap`. - -These do not actually start a uvicorn server — they patch out -``agentcap.proxy.serve_in_thread`` and assert the right kwargs are -computed from the CLI flags. The proxy itself has its own integration -test suite. -""" - -from __future__ import annotations - -import os -import shutil -import types -from pathlib import Path - -import pytest -from click.testing import CliRunner - -from agentcap.__main__ import cli - - -def _has_trufflehog() -> bool: - if shutil.which("trufflehog"): - return True - local = Path.home() / ".local" / "bin" / "trufflehog" - return local.is_file() and os.access(local, os.X_OK) - - -_HAS_TRUFFLEHOG = _has_trufflehog() - - -@pytest.fixture( - params=[ - pytest.param([], id="scan"), - pytest.param(["--no-scan"], id="no-scan"), - ] -) -def scan_args(request): - """Yields ``[]`` (scan on, the default) or ``["--no-scan"]``. - - The scan-on variant requires trufflehog on PATH (or - ~/.local/bin); without it, that parametrisation is skipped so - the no-scan variant still runs.""" - if not request.param and not _HAS_TRUFFLEHOG: - pytest.skip("trufflehog not installed; cannot exercise scan path") - return request.param - - -def test_help_lists_subcommands(): - runner = CliRunner() - result = runner.invoke(cli, ["--help"]) - assert result.exit_code == 0 - for sub in ("export", "run"): - assert sub in result.output - - -def test_version_flag(): - from agentcap import __version__ - - runner = CliRunner() - result = runner.invoke(cli, ["--version"]) - assert result.exit_code == 0 - assert __version__ in result.output - - -def test_run_requires_agent_upstream_and_workdir(): - runner = CliRunner() - result = runner.invoke(cli, ["run"]) - assert result.exit_code != 0 - # Click reports the first missing required option - assert "--agent" in result.output - - -# Plumbing for ``agentcap run`` (CLI flag → env-var composition → -# orchestrator → run.json shape) is exercised end-to-end against a -# real model server in ``tests/test_cli_live.py::test_agentcap_run_live``. -# It replaces two previously heavily-mocked unit tests; the live test -# touches the real proxy + sandbox + agent so we don't have to stub -# them here. - - -def test_export_requires_push(tmp_path: Path): - runner = CliRunner() - result = runner.invoke(cli, ["export", str(tmp_path)]) - assert result.exit_code != 0 - assert "--push" in result.output - - -def test_export_requires_targets_or_all(tmp_path: Path): - runner = CliRunner() - result = runner.invoke( - cli, ["export", "--push", "me/d"] - ) - assert result.exit_code != 0 - assert "run-ids" in result.output or "--all" in result.output - - -def test_export_rejects_both_targets_and_all(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - runner = CliRunner() - result = runner.invoke( - cli, - ["export", str(capture), "--all", "--push", "me/d"], - ) - assert result.exit_code != 0 - assert "not both" in result.output - - -def test_run_hf_router_api_key_auto_from_hf_token_env( - tmp_path: Path, monkeypatch, fake_sandbox -): - import contextlib - - from agentcap.drivers import AgentTurn - - tasks = tmp_path / "tasks.txt" - tasks.write_text("a task\n") - - class _FakeDriver: - name = "hermes" - - def start(self, prompt, *, env=None, timeout=None): - return AgentTurn( - session_id="ses_xyz", response_text="r", returncode=0, - stdout="", stderr="", - ) - - def resume(self, prompt, *, session_id, env=None, timeout=None): - return AgentTurn( - session_id=session_id, response_text="r", returncode=0, - stdout="", stderr="", - ) - - monkeypatch.setattr( - "agentcap.drivers.get_driver", lambda name, **kw: _FakeDriver() - ) - monkeypatch.setattr( - "agentcap.sandbox.require_sandbox_or_die", - lambda **kw: fake_sandbox, - ) - - @contextlib.contextmanager - def fake_proxy(*args, **kwargs): - yield types.SimpleNamespace( - host="127.0.0.1", port=18001, - set_context=lambda **_: None, - ) - - monkeypatch.setattr("agentcap.proxy.serve_in_thread", fake_proxy) - monkeypatch.setenv("HF_TOKEN", "hf_env_token") - monkeypatch.setenv("AGENTCAP_WORKSPACE", str(tmp_path)) - monkeypatch.chdir(tmp_path) - - runner = CliRunner() - result = runner.invoke( - cli, - [ - "run", - "--agent", "hermes", - "--model", "Qwen/Qwen3-8B", - "--upstream", "https://router.huggingface.co", - "--tasks", str(tasks), - "--turns", "1", - ], - ) - assert result.exit_code == 0, result.output - assert "HF Router token source=HF_TOKEN" in result.output - - -def _write_capture(capture_dir: Path, rid: str, model: str) -> None: - import json - (capture_dir / f"{rid}.request.json").write_text(json.dumps({ - "request_id": rid, "captured_at": 1, - "body": {"model": model, "messages": []}, - })) - - -def test_export_auto_detects_model_from_captures( - tmp_path: Path, fake_hf_api, scan_args, -): - """The model auto-detected from captures lands in the committed filename. - Runs under both scan modes — the scan path doesn't change the - parquet shape, but exercising both keeps the gate honest.""" - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "abcdef12", "google/gemma-4-E4B-it") - - result = CliRunner().invoke( - cli, ["export", str(capture), "--push", "me/d", *scan_args], - ) - assert result.exit_code == 0, result.output - op = fake_hf_api.commits[0]["operations"][0] - assert "gemma-4-E4B-it" in op["path_in_repo"] - - -def test_export_auto_detect_fails_on_mixed_models(tmp_path: Path): - """Captures spanning multiple models fail loudly.""" - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "a", "model-1") - _write_capture(capture, "b", "model-2") - - result = CliRunner().invoke( - cli, ["export", str(capture), "--push", "me/d"], - ) - assert result.exit_code != 0 - assert "multiple models" in result.output - - -def test_export_no_model_in_captures_fails(tmp_path: Path): - """A capture dir with no model field at all is a hard error.""" - import json - capture = tmp_path / "capture" - capture.mkdir() - (capture / "abcdef12.request.json").write_text(json.dumps({ - "request_id": "abcdef12", "captured_at": 1, - "body": {"messages": []}, - })) - - result = CliRunner().invoke( - cli, ["export", str(capture), "--push", "me/d"], - ) - assert result.exit_code != 0 - assert "no captured requests with a model field" in result.output - - -def test_export_push_rejects_malformed_dataset_uri(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "abcdef12", "m") - - result = CliRunner().invoke( - cli, ["export", str(capture), "--push", "just-an-owner"], - ) - assert result.exit_code != 0 - assert "/" in result.output - - -def test_export_resolves_workdir_layout_and_reads_agent_from_run_json( - tmp_path: Path, fake_hf_api, scan_args, -): - """Pointing export at a workdir uses its captures/ subdir AND picks up - agent from run.json so the parquet filename embeds the agent.""" - import json - workdir = tmp_path / "ws" / "hermes-local-20260512-162345" - captures = workdir / "captures" - captures.mkdir(parents=True) - _write_capture(captures, "abcdef12", "google/gemma-4-E4B-it") - (workdir / "run.json").write_text(json.dumps({"agent": "hermes"})) - - result = CliRunner().invoke( - cli, ["export", str(workdir), "--push", "me/d", *scan_args], - ) - assert result.exit_code == 0, result.output - op = fake_hf_api.commits[0]["operations"][0] - assert "hermes" in op["path_in_repo"] - - -def test_export_all_walks_workspace_in_one_commit( - tmp_path: Path, monkeypatch, fake_hf_api, scan_args, -): - """--all enumerates every run-id in the workspace and pushes them all - in one git commit.""" - import json - monkeypatch.setenv("AGENTCAP_WORKSPACE", str(tmp_path)) - monkeypatch.chdir(tmp_path) - ws = tmp_path / ".agentcap" - for run_id in ("hermes-local-20260512-160000", "goose-local-20260512-170000"): - d = ws / run_id / "captures" - d.mkdir(parents=True) - _write_capture(d, "abcdef12", "m") - (ws / run_id / "run.json").write_text(json.dumps({ - "agent": run_id.split("-")[0], - })) - - result = CliRunner().invoke( - cli, ["export", "--all", "--push", "me/d", *scan_args], - ) - assert result.exit_code == 0, result.output - assert len(fake_hf_api.commits) == 1 - assert len(fake_hf_api.commits[0]["operations"]) == 2 - - -def test_ls_defaults_to_cwd(tmp_path: Path, monkeypatch): - """Without WORKSPACE, ``ls`` looks at ``./.agentcap/``.""" - monkeypatch.chdir(tmp_path) - _seed_workspace_run_with_meta(tmp_path, "hermes-local-20260512-160000") - result = CliRunner().invoke(cli, ["ls"]) - assert result.exit_code == 0, result.output - assert "hermes-local-20260512-160000" in result.output - - -def test_ls_ignores_env_var(tmp_path: Path, monkeypatch): - """``ls`` MUST NOT consult ``$AGENTCAP_WORKSPACE`` — it's the only - way to keep the command's output a function of its arguments.""" - other = tmp_path / "other" - other.mkdir() - _seed_workspace_run_with_meta(other, "hermes-local-20260512-160000") - monkeypatch.setenv("AGENTCAP_WORKSPACE", str(other)) - monkeypatch.chdir(tmp_path) # cwd has no .agentcap/ - result = CliRunner().invoke(cli, ["ls"]) - # Falls back to ./.agentcap/ (which doesn't exist), NOT to $AGENTCAP_WORKSPACE. - assert result.exit_code == 0 - assert "no workspace" in result.output - - -def test_ls_accepts_parent_dir(tmp_path: Path): - """``ls `` finds ``/.agentcap/``.""" - _seed_workspace_run_with_meta(tmp_path, "hermes-local-20260512-160000") - result = CliRunner().invoke(cli, ["ls", str(tmp_path)]) - assert result.exit_code == 0, result.output - assert "hermes-local-20260512-160000" in result.output - - -def test_ls_accepts_dot_agentcap_dir(tmp_path: Path): - """``ls /.agentcap`` works too — same listing either way.""" - _seed_workspace_run_with_meta(tmp_path, "hermes-local-20260512-160000") - result = CliRunner().invoke(cli, ["ls", str(tmp_path / ".agentcap")]) - assert result.exit_code == 0, result.output - assert "hermes-local-20260512-160000" in result.output - - -def test_ls_accepts_dot_from_inside_workspace(tmp_path: Path, monkeypatch): - """``ls .`` from inside a ``.agentcap/`` dir lists that workspace — - ``Path('.').name`` is ``''`` so the classifier must normalize.""" - _seed_workspace_run_with_meta(tmp_path, "hermes-local-20260512-160000") - monkeypatch.chdir(tmp_path / ".agentcap") - result = CliRunner().invoke(cli, ["ls", "."]) - assert result.exit_code == 0, result.output - assert "hermes-local-20260512-160000" in result.output - - -def test_ls_missing_workspace_message(tmp_path: Path, monkeypatch): - """Missing-workspace error is silent about ``$AGENTCAP_WORKSPACE`` - since ``ls`` doesn't consult it.""" - monkeypatch.chdir(tmp_path) - result = CliRunner().invoke(cli, ["ls"]) - assert result.exit_code == 0 - assert "AGENTCAP_WORKSPACE" not in result.output - assert "no workspace" in result.output - - -def _seed_workspace_run(root: Path, run_id: str, rids: list[tuple[str, str]]) -> None: - """Create a fake workspace run with captures for each (rid, prompt).""" - import json as _json - cap = root / ".agentcap" / run_id / "captures" - cap.mkdir(parents=True) - for i, (rid, prompt) in enumerate(rids): - body = {"model": "m", "messages": [{"role": "user", "content": prompt}]} - (cap / f"{rid}.request.json").write_text(_json.dumps({ - "request_id": rid, "captured_at": 1000 + i, - "upstream_url": "http://x", "body": body, - })) - (cap / f"{rid}.response.json").write_text(_json.dumps({ - "request_id": rid, "captured_at_resp": 1001 + i, - "status_code": 200, "body": {}, - })) - - -def _seed_workspace_run_with_meta( - root: Path, run_id: str, *, agent: str = "hermes", model: str = "m", -) -> None: - """Like _seed_workspace_run but also writes a minimal run.json so - the run picker discovers it.""" - import json as _json - _seed_workspace_run(root, run_id, [("aaa", "p1")]) - (root / ".agentcap" / run_id / "run.json").write_text(_json.dumps({ - "agent": agent, "model": model, "upstream": "http://x", - "turns_per_task": 1, - "tasks": [{ - "task_id": "task_01", "prompt": "p1", "completed_turns": 1, - "turns": [{"turn": 1, "returncode": 0, "duration_s": 1.0}], - }], - })) - - -def test_inspect_resolves_rid_from_workspace(tmp_path: Path, monkeypatch): - import json as _json - monkeypatch.setenv("AGENTCAP_WORKSPACE", str(tmp_path)) - monkeypatch.chdir(tmp_path) - cap = tmp_path / ".agentcap" / "hermes-local-20260101-000000" / "captures" - cap.mkdir(parents=True) - body = {"model": "m", "messages": [{"role": "user", "content": "hi"}]} - (cap / "abcdef12.request.json").write_text(_json.dumps({ - "request_id": "abcdef12", "captured_at": 1, - "upstream_url": "http://x", "body": body, - })) - (cap / "abcdef12.response.json").write_text(_json.dumps({ - "request_id": "abcdef12", "captured_at_resp": 2, - "status_code": 200, "body": {}, - })) - - result = CliRunner().invoke(cli, ["inspect", "abcdef12"]) - assert result.exit_code == 0, result.stderr - assert _json.loads(result.stdout) == body - - -def test_inspect_unknown_rid_errors(tmp_path: Path, monkeypatch): - monkeypatch.setenv("AGENTCAP_WORKSPACE", str(tmp_path)) - monkeypatch.chdir(tmp_path) - (tmp_path / ".agentcap").mkdir() - result = CliRunner().invoke(cli, ["inspect", "ghost"]) - assert result.exit_code != 0 - assert "ghost" in result.output - - -def test_inspect_run_id_errors_without_fzf(tmp_path: Path, monkeypatch): - """``inspect `` needs the request picker; without fzf on PATH - the command errors out with a clear message instead of dumping a - half-usable table.""" - monkeypatch.setenv("AGENTCAP_WORKSPACE", str(tmp_path)) - monkeypatch.chdir(tmp_path) - monkeypatch.setenv("PATH", "") - # _seed_workspace_run_with_meta writes the run.json the classifier - # needs to recognise the dashed name as a run-id under cwd's - # ``.agentcap`` (otherwise it falls through to other rules). - _seed_workspace_run_with_meta( - tmp_path, "hermes-local-20260101-000000", - agent="hermes", model="m", - ) - - result = CliRunner().invoke(cli, ["inspect", "hermes-local-20260101-000000"]) - assert result.exit_code != 0 - assert "fzf is required" in result.output - - -def test_inspect_no_arg_errors_without_fzf(tmp_path: Path, monkeypatch): - """``inspect`` with no arg also needs the run picker; same error.""" - monkeypatch.setenv("AGENTCAP_WORKSPACE", str(tmp_path)) - monkeypatch.chdir(tmp_path) - monkeypatch.setenv("PATH", "") - _seed_workspace_run_with_meta( - tmp_path, "hermes-local-20260101-000000", - agent="hermes", model="m", - ) - - result = CliRunner().invoke(cli, ["inspect"]) - assert result.exit_code != 0 - assert "fzf is required" in result.output - - -def test_inspect_no_arg_empty_workspace_errors(tmp_path: Path, monkeypatch): - monkeypatch.setenv("AGENTCAP_WORKSPACE", str(tmp_path)) - monkeypatch.chdir(tmp_path) - (tmp_path / ".agentcap").mkdir() - result = CliRunner().invoke(cli, ["inspect"]) - assert result.exit_code != 0 - assert "no runs" in result.output or "no workspace" in result.output - - diff --git a/tests/test_cli_live.py b/tests/test_cli_live.py deleted file mode 100644 index 7dfc3f6..0000000 --- a/tests/test_cli_live.py +++ /dev/null @@ -1,106 +0,0 @@ -"""End-to-end live test for ``agentcap run``. - -Exercises the full CLI → orchestrator → sandbox → real agent path -against a real OpenAI-compat ``/v1`` server (the live fixture spawns -``ghcr.io/ggml-org/llama.cpp:server`` as a sibling podman container). -Replaces the heavily-mocked plumbing tests previously in -``test_cli.py``: -``test_run_synthesized_defaults_from_upstream_and_model`` and -``test_run_invokes_orchestrator_under_proxy``. - -Pi is the agent under test — its image install is small, sessions -stream as per-file JSONL through the symlink (no SQLite dump -required), and it's the most CI-friendly of the four agents. -""" - -from __future__ import annotations - -import json -from pathlib import Path - -import pytest -from click.testing import CliRunner - -from agentcap.__main__ import cli - - -@pytest.mark.live -def test_agentcap_run_live( - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, - live_llama_url: str, - live_model: str, - agentcap_image_for, -): - """``agentcap run --agent pi`` against a real model server. - - Verifies the CLI plumbing the mocked tests used to cover: - - flag parsing → ``AGENTCAP_PROXY_URL`` / ``AGENTCAP_MODEL`` / - ``AGENTCAP_PROVIDER`` / ``AGENTCAP_TRACES_DIR`` / - ``AGENTCAP_STATE_DIR`` reach the sandbox, - - the in-process proxy wraps the orchestrator (captures land in - ``/captures/``), - - per-run ``traces/`` is populated as the agent runs (pi streams - JSONL through the in-container symlink), - - ``run.json`` summary is written with the right shape. - - No internal monkeypatching — the only env manipulation is - ``AGENTCAP_WORKSPACE`` (a legitimate CLI input). - """ - # Pre-build the pi image. The fixture is also pulled in by the - # sandbox-using live tests; first call builds, subsequent calls - # are a no-op. - agentcap_image_for("pi") - - tasks = tmp_path / "tasks.txt" - tasks.write_text("Say hello in one short sentence, then stop.\n") - - workspace = tmp_path / "ws" - workspace.mkdir() - monkeypatch.setenv("AGENTCAP_WORKSPACE", str(workspace)) - - # ``--upstream`` wants the server root reachable from the host - # (the in-process proxy will forward to it). ``live_llama_url`` - # is host-side; ``live_proxy_base_url`` is agent-side and would - # not resolve from the host process. - upstream = live_llama_url - - runner = CliRunner() - result = runner.invoke( - cli, - [ - "run", - "--agent", "pi", - "--model", live_model, - "--upstream", upstream, - "--tasks", str(tasks), - "--turns", "1", - "--timeout", "180", - ], - ) - assert result.exit_code == 0, result.output - - # One run dir was created under the workspace. - run_dirs = sorted((workspace / ".agentcap").glob("pi-*")) - assert len(run_dirs) == 1, run_dirs - run_dir = run_dirs[0] - - # run.json shape — same assertions the mocked predecessor made. - summary = json.loads((run_dir / "run.json").read_text()) - assert summary["agent"] == "pi" - assert summary["model"] == live_model - assert summary["upstream"] == upstream - assert summary["turns_per_task"] == 1 - assert len(summary["tasks"]) == 1 - task = summary["tasks"][0] - assert task["completed_turns"] == 1 - assert task["session_id"], "pi should mint a session id" - - # Captures landed on disk via the in-process proxy. - captures = list((run_dir / "captures").glob("*.request.json")) - assert captures, "proxy should have captured at least one request" - - # Pi's native session JSONL landed via the in-container symlink. - traces = list((run_dir / "traces").iterdir()) - assert traces, "pi should have streamed at least one trace file" - assert any(f.suffix == ".jsonl" for f in traces) diff --git a/tests/test_drivers.py b/tests/test_drivers.py deleted file mode 100644 index c7a3474..0000000 --- a/tests/test_drivers.py +++ /dev/null @@ -1,239 +0,0 @@ -"""Pure-Python tests for ``agentcap.drivers``. - -These cover the *parser*, *config-builder*, and *overlay-scaffolding* -helpers — none of which shell out to a real agent. Live integration -tests for each driver (which actually invoke the agent CLI against a -running model server) live in ``test_drivers_live.py`` and are gated -on the agent binary being available in the sandbox image and on -``podman`` being on the host PATH. -""" - -from __future__ import annotations - - -import pytest -import yaml - -from agentcap.drivers import get_driver -from agentcap.drivers.goose import GooseDriver -from agentcap.drivers.hermes import ( - HermesDriver, - _rewrite_config, - parse_response_text as hermes_parse, - parse_session_id, -) -from agentcap.drivers.opencode import ( - OpenCodeDriver, - build_opencode_config, - parse_response_text as opencode_parse, - parse_session_id as opencode_parse_session, -) -from agentcap.drivers.pi import PiDriver, build_models_json - - -# --------------------------------------------------------------------------- -# Hermes parsers -# --------------------------------------------------------------------------- - - -def test_parse_session_id_finds_id(): - s = "blah\nsession_id: abc123_xyz\nmore\n" - assert parse_session_id(s) == "abc123_xyz" - - -def test_parse_session_id_missing_returns_none(): - assert parse_session_id("nothing here") is None - - -def test_hermes_parse_response_initial_run(): - out = "Working on it...\nHere is the answer.\n" - assert hermes_parse(out) == "Working on it...\nHere is the answer." - - -def test_hermes_parse_response_after_resumed_marker(): - out = ( - "↻ Resumed abc123\n" - "old content\n" - "↻ Resumed abc123\n" - "the actual final answer\n" - "across two lines\n" - ) - assert hermes_parse(out) == "the actual final answer\nacross two lines" - - -def test_hermes_parse_response_strips_session_id_lines(): - out = "session_id: aa_bb\nactual response\n" - assert hermes_parse(out) == "actual response" - - -# --------------------------------------------------------------------------- -# OpenCode parsers + config builder -# --------------------------------------------------------------------------- - - -def test_opencode_parse_concatenates_text_events(): - stdout = ( - '{"type":"step_start"}\n' - '{"type":"text","text":"hello "}\n' - '{"type":"text","text":"world"}\n' - '{"type":"step_finish"}\n' - ) - assert opencode_parse(stdout) == "hello world" - - -def test_opencode_parse_skips_malformed_lines(): - stdout = ( - "not json at all\n" - '{"type":"text","text":"good"}\n' - "\n" - ) - assert opencode_parse(stdout) == "good" - - -def test_build_opencode_config_shape(): - cfg = build_opencode_config( - provider_name="local", - base_url="http://127.0.0.1:8001/v1", - model_id="qwen-test", - ) - prov = cfg["provider"]["local"] - assert prov["options"]["baseURL"] == "http://127.0.0.1:8001/v1" - assert "qwen-test" in prov["models"] - assert cfg["model"] == "local/qwen-test" - - -# --------------------------------------------------------------------------- -# pi config builder -# --------------------------------------------------------------------------- - - -def test_pi_build_models_json_shape(): - payload = build_models_json( - provider_name="local", - base_url="http://127.0.0.1:8001/v1", - model_id="qwen-test", - ) - prov = payload["providers"]["local"] - assert prov["baseUrl"] == "http://127.0.0.1:8001/v1" - assert prov["api"] == "openai-completions" - # llama.cpp's OpenAI shim doesn't accept the developer role pi - # uses for reasoning-capable models — the config must downgrade. - assert prov["compat"]["supportsDeveloperRole"] is False - assert prov["compat"]["supportsReasoningEffort"] is False - assert prov["models"][0]["id"] == "qwen-test" - - -# --------------------------------------------------------------------------- -# Driver registry + non-resumable driver behaviour -# --------------------------------------------------------------------------- - - -def test_get_driver_known_names(fake_sandbox): - assert isinstance(get_driver("hermes", sandbox=fake_sandbox), HermesDriver) - assert isinstance(get_driver("opencode", sandbox=fake_sandbox), OpenCodeDriver) - assert isinstance(get_driver("goose", sandbox=fake_sandbox), GooseDriver) - assert isinstance(get_driver("pi", sandbox=fake_sandbox), PiDriver) - - -def test_get_driver_unknown_name(fake_sandbox): - with pytest.raises(ValueError): - get_driver("not-a-real-driver", sandbox=fake_sandbox) - - -def test_opencode_parse_session_id_finds_top_level(): - stdout = ( - '{"type":"step_start","sessionID":"ses_abc123"}\n' - '{"type":"text","text":"hi"}\n' - ) - assert opencode_parse_session(stdout) == "ses_abc123" - - -def test_opencode_parse_session_id_finds_nested_under_part(): - stdout = ( - '{"type":"step_finish","timestamp":1,"part":{"sessionID":"ses_xyz"}}\n' - ) - assert opencode_parse_session(stdout) == "ses_xyz" - - -def test_opencode_parse_session_id_missing_returns_none(): - assert opencode_parse_session('{"type":"text","text":"hi"}\n') is None - - -def test_hermes_driver_close_is_idempotent(fake_sandbox): - drv = HermesDriver(sandbox=fake_sandbox) - drv.close() - drv.close() # second call should not raise - - -# --------------------------------------------------------------------------- -# Hermes overlay HERMES_HOME (proxy_base_url support) -# --------------------------------------------------------------------------- - - -def test_rewrite_config_replaces_base_url_only(): - text = ( - "model:\n" - " provider: custom\n" - " base_url: http://localhost:8000/v1\n" - " key_env: OPENAI_API_KEY\n" - ) - out = _rewrite_config(text, base_url="http://127.0.0.1:8001/v1") - assert "base_url: http://127.0.0.1:8001/v1" in out - assert "http://localhost:8000/v1" not in out - # other keys preserved - assert "key_env: OPENAI_API_KEY" in out - assert "provider: custom" in out - # no context_length added when override not requested - assert "context_length" not in out - - -def test_rewrite_config_inserts_model_section_when_missing(): - out = _rewrite_config("", base_url="http://x:1/v1") - assert "base_url: http://x:1/v1" in out - - -def test_rewrite_config_overrides_both_context_length_guards(): - """Hermes refuses startup if EITHER ``model.context_length`` or - ``auxiliary.compression.context_length`` is below 64 K. The - overlay must override both.""" - text = "model:\n provider: custom\n base_url: http://localhost:8000/v1\n" - out = _rewrite_config( - text, - base_url="http://127.0.0.1:8001/v1", - context_length_override=65536, - ) - cfg = yaml.safe_load(out) - assert cfg["model"]["context_length"] == 65536 - assert cfg["auxiliary"]["compression"]["context_length"] == 65536 - assert cfg["model"]["base_url"] == "http://127.0.0.1:8001/v1" - - -def test_rewrite_config_preserves_existing_auxiliary_keys(): - text = ( - "model:\n" - " provider: custom\n" - " base_url: http://localhost:8000/v1\n" - "auxiliary:\n" - " compression:\n" - " model: my-compressor\n" - " other_key: keep_me\n" - ) - out = _rewrite_config( - text, - base_url="http://x/v1", - context_length_override=65536, - ) - cfg = yaml.safe_load(out) - assert cfg["auxiliary"]["compression"]["model"] == "my-compressor" - assert cfg["auxiliary"]["compression"]["context_length"] == 65536 - assert cfg["auxiliary"]["other_key"] == "keep_me" - - -# NOTE: the host-side `build_overlay_hermes_home` function was -# removed when HermesDriver moved its overlay logic *inside* the -# sandbox. Behaviour previously verified by 5 unit tests against -# fake user-homes is now covered by the live driver test -# (tests/test_drivers_live.py::test_hermes_live) which exercises -# the full path against a real podman container. The pure-Python -# parts that survived as standalone helpers (`_rewrite_config`) -# keep their own tests above. diff --git a/tests/test_drivers_live.py b/tests/test_drivers_live.py deleted file mode 100644 index 2bd3195..0000000 --- a/tests/test_drivers_live.py +++ /dev/null @@ -1,110 +0,0 @@ -"""Live integration tests for each agent driver. - -Verifies the infrastructure path only: agent runs inside its per-agent -podman container, dials the in-process capture proxy, and gets a -response back. Agent output *quality* — whether the model emits a -syntactically valid tool call, whether it picks the right file, etc. -— is intentionally not asserted. A separate (model-grading) test -would be the place for that. - -Assertions per agent: ``returncode == 0`` and ``turn.response_text`` -non-empty (the agent received at least one model response through -the proxy). -""" - -from __future__ import annotations - -import pytest - -from agentcap.drivers.goose import GooseDriver -from agentcap.drivers.hermes import HermesDriver -from agentcap.drivers.opencode import OpenCodeDriver -from agentcap.drivers.pi import PiDriver - - -INFRA_PROMPT = "Say hi, then stop." - - -def _assert_infrastructure_works(turn) -> None: - assert turn.returncode == 0, ( - f"agent exited rc={turn.returncode}\n" - f"--- stdout (tail) ---\n{turn.stdout[-500:]}\n" - f"--- stderr (tail) ---\n{turn.stderr[-500:]}" - ) - assert turn.response_text, ( - f"agent produced no response text — wire path may be broken.\n" - f"--- stdout (tail) ---\n{turn.stdout[-500:]}\n" - f"--- stderr (tail) ---\n{turn.stderr[-500:]}" - ) - - -@pytest.mark.live -def test_goose_live(live_proxy_base_url, live_model, agent_proj_for): - sandbox, proj = agent_proj_for("goose") - drv = GooseDriver( - sandbox=sandbox, binary="goose", model=live_model, cwd=proj, - ) - try: - turn = drv.start(INFRA_PROMPT, timeout=900) - assert turn.session_id and turn.session_id.startswith("agentcap-") - _assert_infrastructure_works(turn) - finally: - drv.close() - - -@pytest.mark.live -def test_pi_live(live_proxy_base_url, live_model, agent_proj_for): - sandbox, proj = agent_proj_for("pi") - drv = PiDriver( - sandbox=sandbox, binary="pi", model=live_model, cwd=proj, - ) - try: - turn = drv.start(INFRA_PROMPT, timeout=900) - _assert_infrastructure_works(turn) - finally: - drv.close() - - -@pytest.mark.live -@pytest.mark.skip( - reason=( - "opencode 1.15.x doesn't pick up the baked ``agent.minimal`` from " - "``~/.config/opencode/opencode.json`` inside the per-agent " - "container — fails with ``agent \"minimal\" not found`` and " - "``Model not found`` even with ``mode: primary`` + explicit " - "model. Needs investigation: instrument the init script with " - "``opencode debug config`` to see what config opencode actually " - "resolves." - ) -) -def test_opencode_live(live_proxy_base_url, live_model, agent_proj_for): - sandbox, proj = agent_proj_for("opencode") - # OpenCode recursively globs from / in empty dirs; seed a - # package.json to bound its exploration. - sandbox.write_text( - f"{proj}/package.json", '{"name":"smoke","version":"0.0.0"}\n' - ) - drv = OpenCodeDriver( - sandbox=sandbox, binary="opencode", model=live_model, cwd=proj, - minimal_agent=True, - ) - try: - turn = drv.start(INFRA_PROMPT, timeout=900) - _assert_infrastructure_works(turn) - finally: - drv.close() - - -@pytest.mark.live -def test_hermes_live(live_proxy_base_url, agent_proj_for): - sandbox, proj = agent_proj_for("hermes") - drv = HermesDriver( - sandbox=sandbox, binary="hermes", cwd=proj, - ignore_rules=True, toolsets="file", - ) - try: - turn = drv.start(INFRA_PROMPT, timeout=900) - assert turn.session_id is not None - _assert_infrastructure_works(turn) - finally: - drv.close() diff --git a/tests/test_export.py b/tests/test_export.py deleted file mode 100644 index c67169a..0000000 --- a/tests/test_export.py +++ /dev/null @@ -1,541 +0,0 @@ -"""Unit tests for ``agentcap.export``. - -Captures + traces are now pushed to a paired ``-captures`` / -``--traces`` dataset pair under a single HF Collection. The -tests assert: URI parsing, repo-id derivation, the parquet payload -shape (incl. the new ``run_id`` column), the raw-JSONL trace upload, -the README cross-links, and ``ensure_collection`` idempotency. -""" - -from __future__ import annotations - -import json -from pathlib import Path - -import pytest - -from agentcap.export import ( - _row, - captures_repo_id, - detect_model, - detect_provider_columns, - ensure_collection, - export_local, - parse_collection_base, - push_agent_traces_dataset, - push_captures_dataset, - traces_repo_id_for, -) - - -def _write_capture( - capture_dir: Path, - rid: str, - body: dict, - response: dict, - *, - upstream_url: str = "http://127.0.0.1:8000", - upstream_fingerprint: dict | None = None, -) -> None: - (capture_dir / f"{rid}.request.json").write_text( - json.dumps({ - "request_id": rid, - "captured_at": 1000, - "upstream_url": upstream_url, - "body": body, - }) - ) - (capture_dir / f"{rid}.response.json").write_text( - json.dumps({ - "request_id": rid, - "captured_at_resp": 1001, - "stream": False, - "status_code": 200, - "body": response, - "upstream_fingerprint": upstream_fingerprint or {}, - }) - ) - - -_BODY = { - "model": "google/gemma-4-E4B-it", - "messages": [{"role": "user", "content": "u"}], - "tools": [], -} - - -# --------------------------------------------------------------------------- -# Row construction -# --------------------------------------------------------------------------- - - -def test_row_serialises_bodies_as_json_strings(): - row = _row( - request_id="rid", - request_body={"model": "m", "messages": [{"role": "user", "content": "x"}]}, - response_body={"choices": [{"message": {"content": "hi"}}]}, - captured_at=42, - upstream_fp=None, - ) - assert row["request_id"] == "rid" - assert row["model"] == "m" - assert row["captured_at"] == 42 - assert isinstance(row["request"], str) - assert isinstance(row["response"], str) - assert json.loads(row["request"])["messages"][0]["content"] == "x" - assert json.loads(row["response"])["choices"][0]["message"]["content"] == "hi" - - -def test_row_includes_fingerprint_columns_when_present(): - fp = { - "x_served_by": "fireworks-pod-7", - "build_info": "b9039", - "served_model": "qwen-actually-served", - } - row = _row("rid", _BODY, {}, 1, fp) - assert row["served_by"] == "fireworks-pod-7" - assert row["served_build_info"] == "b9039" - assert row["served_model"] == "qwen-actually-served" - - -def test_row_fingerprint_columns_default_to_none(): - row = _row("rid", _BODY, {}, 1, None) - assert row["served_by"] is None - assert row["served_build_info"] is None - assert row["served_model"] is None - - -def test_row_empty_response_serialises_to_empty_object(): - row = _row("rid", _BODY, None, 1, None) - assert row["response"] == "{}" - - -# --------------------------------------------------------------------------- -# detect_model — same uniqueness contract as before -# --------------------------------------------------------------------------- - - -def test_detect_model_returns_unique_model(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - body = {"model": "google/gemma-4-E4B-it", "messages": []} - _write_capture(capture, "a", body, {"choices": []}) - _write_capture(capture, "b", body, {"choices": []}) - assert detect_model(capture) == "google/gemma-4-E4B-it" - - -def test_detect_model_strips_revision_suffix(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "a", {"model": "google/gemma-4-E4B-it", "messages": []}, {}) - _write_capture(capture, "b", {"model": "google/gemma-4-E4B-it@main", "messages": []}, {}) - assert detect_model(capture) == "google/gemma-4-E4B-it" - - -def test_detect_model_raises_on_mixed_models(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "a", {"model": "google/gemma-4-E4B-it", "messages": []}, {}) - _write_capture(capture, "b", {"model": "Qwen/Qwen3-7B", "messages": []}, {}) - with pytest.raises(ValueError) as exc_info: - detect_model(capture) - assert "multiple models" in str(exc_info.value) - - -def test_detect_model_returns_none_on_empty_capture_dir(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - assert detect_model(capture) is None - - -def test_detect_model_returns_none_when_no_request_has_model_field(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - (capture / "rid.request.json").write_text( - json.dumps({"request_id": "rid", "captured_at": 1, "body": {"messages": []}}) - ) - assert detect_model(capture) is None - - -# --------------------------------------------------------------------------- -# Provider derivation from the per-request upstream_url stamp -# --------------------------------------------------------------------------- - - -def test_detect_provider_columns_hostname_classification(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - _write_capture( - capture, "rid", _BODY, {}, - upstream_url="http://127.0.0.1:8000", - ) - cols = detect_provider_columns(capture) - assert cols == {"provider": "local", "upstream_url": "http://127.0.0.1:8000"} - - -def test_detect_provider_columns_hf_router_sub_provider_refinement(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - _write_capture( - capture, "rid", - {"model": "meta-llama/Llama-3.3-70B:fireworks-ai", "messages": []}, - {}, - upstream_url="https://router.huggingface.co", - ) - cols = detect_provider_columns(capture) - assert cols["provider"] == "hf-router/fireworks-ai" - assert cols["upstream_url"] == "https://router.huggingface.co" - - -def test_detect_provider_columns_empty_when_no_upstream_stamp(tmp_path: Path): - capture = tmp_path / "capture" - capture.mkdir() - (capture / "rid.request.json").write_text( - json.dumps({"request_id": "rid", "captured_at": 1, "body": _BODY}) - ) - assert detect_provider_columns(capture) == {} - - -# --------------------------------------------------------------------------- -# Collection-base parsing + repo-id derivation -# --------------------------------------------------------------------------- - - -def test_parse_collection_base_owner_and_base(): - owner, base = parse_collection_base("owner/my-collection") - assert owner == "owner" - assert base == "my-collection" - - -def test_parse_collection_base_strips_hf_datasets_prefix(): - owner, base = parse_collection_base("hf://datasets/owner/base") - assert (owner, base) == ("owner", "base") - - -def test_parse_collection_base_rejects_subdir(): - """A third segment is ambiguous — the collection-base form is a - single ````, not a ``/``.""" - with pytest.raises(ValueError, match="/"): - parse_collection_base("owner/base/extra") - - -def test_parse_collection_base_rejects_missing_name(): - with pytest.raises(ValueError, match="/"): - parse_collection_base("owner") - - -def test_repo_id_derivation(): - assert captures_repo_id("me", "sweep") == "me/sweep-captures" - assert traces_repo_id_for("me", "sweep", "pi") == "me/sweep-pi-traces" - assert traces_repo_id_for("me", "sweep", "hermes") == "me/sweep-hermes-traces" - - -# --------------------------------------------------------------------------- -# push_captures_dataset -# --------------------------------------------------------------------------- - - -def test_push_captures_creates_captures_repo(tmp_path: Path, fake_hf_api): - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "rid", _BODY, {"choices": []}) - - repo_id, n_rows = push_captures_dataset( - [{"capture_dir": capture, "model": "google/gemma-4-E4B-it", "agent": "pi", - "run_id": "pi-local-20260601-090000"}], - owner="me", base="sweep", - ) - - assert repo_id == "me/sweep-captures" - assert n_rows == [1] - assert fake_hf_api.created_repos[0] == { - "repo_id": "me/sweep-captures", "repo_type": "dataset", - "exist_ok": True, "private": True, - } - - -def test_push_captures_lands_under_data(tmp_path: Path, fake_hf_api): - import re - - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "rid", _BODY, {"choices": []}) - - push_captures_dataset( - [{"capture_dir": capture, "model": "google/gemma-4-E4B-it", - "agent": "pi", "run_id": "pi-local-20260601-090000"}], - owner="me", base="sweep", - ) - op = fake_hf_api.commits[0]["operations"][0] - # ``-captures`` repo, single ``data/.parquet`` layout. - assert re.fullmatch( - r"data/train-pi-gemma-4-E4B-it-local-\d{8}T\d{6}-[0-9a-f]{6}\.parquet", - op["path_in_repo"], - ), op["path_in_repo"] - - -def test_push_captures_stamps_run_id_column(tmp_path: Path, fake_hf_api): - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "rid", _BODY, {"choices": []}) - - push_captures_dataset( - [{"capture_dir": capture, "model": "m", "agent": "pi", - "run_id": "pi-local-20260601-090000"}], - owner="me", base="sweep", - ) - op = fake_hf_api.commits[0]["operations"][0] - assert "run_id" in op["columns"] - - -def test_push_captures_batches_into_one_commit(tmp_path: Path, fake_hf_api): - items = [] - for i in range(3): - cap = tmp_path / f"capture-{i}" - cap.mkdir() - _write_capture(cap, f"rid{i}", _BODY, {}) - items.append({ - "capture_dir": cap, "model": "m", "agent": "hermes", - "run_id": f"hermes-local-2026060{i+1}-000000", - }) - - push_captures_dataset(items, owner="me", base="sweep") - assert len(fake_hf_api.commits) == 1 - assert len(fake_hf_api.commits[0]["operations"]) == 3 - paths = [op["path_in_repo"] for op in fake_hf_api.commits[0]["operations"]] - assert len(set(paths)) == 3, f"filenames collided: {paths}" - - -def test_push_captures_seeds_readme_with_collection_link( - tmp_path: Path, fake_hf_api, -): - fake_hf_api.existing_files = [] # simulate freshly-created repo - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "rid", _BODY, {}) - - push_captures_dataset( - [{"capture_dir": capture, "model": "m", "run_id": "r"}], - owner="me", base="sweep", - ) - - ops = fake_hf_api.commits[0]["operations"] - readme_ops = [op for op in ops if op["path_in_repo"] == "README.md"] - assert readme_ops, "captures README missing on first push" - body = readme_ops[0]["bytes"].decode("utf-8") - # Cross-links to the traces sibling family and the Collection. - assert "me/sweep-captures" in body - assert "sweep--traces" in body - assert "sweep Collection" in body - - -def test_push_captures_skips_readme_on_subsequent_push( - tmp_path: Path, fake_hf_api, -): - # fake_hf_api defaults to existing_files=["README.md"] - capture = tmp_path / "capture" - capture.mkdir() - _write_capture(capture, "rid", _BODY, {}) - - push_captures_dataset( - [{"capture_dir": capture, "model": "m", "run_id": "r"}], - owner="me", base="sweep", - ) - paths = [op["path_in_repo"] for op in fake_hf_api.commits[0]["operations"]] - assert "README.md" not in paths - - -# --------------------------------------------------------------------------- -# push_agent_traces_dataset — raw JSONL upload -# --------------------------------------------------------------------------- - - -def test_push_traces_uploads_files_as_is(tmp_path: Path, fake_hf_api): - fake_hf_api.existing_files = [] - traces = tmp_path / "traces" - traces.mkdir() - (traces / "session-a.jsonl").write_text('{"type":"session","id":"a"}\n') - (traces / "session-b.jsonl").write_text('{"type":"session","id":"b"}\n') - - repo_id, n_files = push_agent_traces_dataset( - [{"traces_dir": traces, "run_id": "pi-local-20260601-090000"}], - owner="me", base="sweep", agent="pi", - ) - - assert repo_id == "me/sweep-pi-traces" - assert n_files == 2 - paths = [op["path_in_repo"] for op in fake_hf_api.commits[0]["operations"]] - # One README + two raw files under data//. - assert "README.md" in paths - assert "data/pi-local-20260601-090000/session-a.jsonl" in paths - assert "data/pi-local-20260601-090000/session-b.jsonl" in paths - - -def test_push_traces_readme_marks_agent_and_links_captures( - tmp_path: Path, fake_hf_api, -): - fake_hf_api.existing_files = [] - traces = tmp_path / "traces" - traces.mkdir() - (traces / "x.jsonl").write_text("{}\n") - - push_agent_traces_dataset( - [{"traces_dir": traces, "run_id": "r1"}], - owner="me", base="sweep", agent="hermes", - ) - ops = fake_hf_api.commits[0]["operations"] - readme = next(op for op in ops if op["path_in_repo"] == "README.md") - body = readme["bytes"].decode("utf-8") - # Tags: agent-traces, agentcap-traces, per-agent suffix. - assert "agent-traces" in body - assert "agentcap-traces-hermes" in body - # source_datasets points back at the captures sibling. - assert "me/sweep-captures" in body - assert "sweep Collection" in body - - -def test_push_traces_skips_when_no_files_and_readme_exists( - tmp_path: Path, fake_hf_api, -): - """Empty trace dir + README already in repo → no commit.""" - traces = tmp_path / "traces" - traces.mkdir() - repo_id, n_files = push_agent_traces_dataset( - [{"traces_dir": traces, "run_id": "r1"}], - owner="me", base="sweep", agent="pi", - ) - assert repo_id == "me/sweep-pi-traces" - assert n_files == 0 - assert fake_hf_api.commits == [] - - -def test_push_traces_repo_created_private(tmp_path: Path, fake_hf_api): - traces = tmp_path / "traces" - traces.mkdir() - (traces / "x.jsonl").write_text("{}") - push_agent_traces_dataset( - [{"traces_dir": traces, "run_id": "r1"}], - owner="me", base="sweep", agent="pi", - ) - record = next( - r for r in fake_hf_api.created_repos if r["repo_id"] == "me/sweep-pi-traces" - ) - assert record["private"] is True - - -# --------------------------------------------------------------------------- -# ensure_collection — find-or-create + idempotent item-add -# --------------------------------------------------------------------------- - - -def test_ensure_collection_creates_when_missing(fake_hf_api): - slug = ensure_collection( - owner="me", base="sweep", - repos=["me/sweep-captures", "me/sweep-pi-traces"], - ) - assert slug.startswith("me/sweep-") - assert len(fake_hf_api.collections_created) == 1 - assert fake_hf_api.collections_created[0]["title"] == "sweep" - assert fake_hf_api.collections_created[0]["private"] is True - item_ids = [it["item_id"] for it in fake_hf_api.collection_items] - assert item_ids == ["me/sweep-captures", "me/sweep-pi-traces"] - - -def test_ensure_collection_is_idempotent_on_second_call(fake_hf_api): - first = ensure_collection( - owner="me", base="sweep", - repos=["me/sweep-captures"], - ) - second = ensure_collection( - owner="me", base="sweep", - repos=["me/sweep-captures", "me/sweep-hermes-traces"], - ) - assert first == second - # Only one collection was created across the two calls. - assert len(fake_hf_api.collections_created) == 1 - - -# --------------------------------------------------------------------------- -# Round-trip — captures parquet shape (incl. run_id column) -# --------------------------------------------------------------------------- - - -def test_export_local_round_trip(tmp_path: Path): - """End-to-end: write captures, export with provider+run_id columns, - read parquet back, assert columns + that request JSON survives - serialisation.""" - import pyarrow.parquet as pq - - capture = tmp_path / "capture" - capture.mkdir() - _write_capture( - capture, "ra", _BODY, {"choices": [{"index": 0}]}, - upstream_fingerprint={"x_served_by": "pod-7", "served_model": "gemma"}, - ) - _write_capture(capture, "rb", _BODY, {"choices": [{"index": 0}]}) - - out = tmp_path / "rows.parquet" - extra_cols = { - "provider": "local", - "upstream_url": "http://127.0.0.1:8000", - "run_id": "pi-local-20260601-090000", - } - n_rows = export_local( - capture, out, progress=False, provider_columns=extra_cols, - ) - assert n_rows == 2 - - table = pq.read_table(out) - assert table.num_rows == 2 - assert set(table.column_names) == { - "request_id", "model", "captured_at", "task_id", "turn", - "request", "response", - "served_by", "served_build_info", "served_model", - "provider", "upstream_url", "run_id", - } - rows = table.to_pylist() - by_rid = {r["request_id"]: r for r in rows} - assert by_rid["ra"]["served_by"] == "pod-7" - assert by_rid["ra"]["served_model"] == "gemma" - assert by_rid["rb"]["served_by"] is None - for r in rows: - assert r["provider"] == "local" - assert r["upstream_url"] == "http://127.0.0.1:8000" - assert r["run_id"] == "pi-local-20260601-090000" - sample = json.loads(by_rid["ra"]["request"]) - assert sample["messages"][0]["role"] == "user" - - -def test_export_local_stamps_agent_and_model_in_schema_metadata(tmp_path): - """``agent`` and ``model`` are written to the parquet's schema-level - KV metadata. ``inspect`` reads them from there instead of parsing - the filename — so it's the canonical labelling source.""" - import pyarrow.parquet as pq - capture = tmp_path / "cap" - capture.mkdir() - _write_capture(capture, "ra", _BODY, {"choices": [{"index": 0}]}) - - out = tmp_path / "rows.parquet" - export_local( - capture, out, progress=False, agent="hermes", model="GLM-4.6", - ) - md = pq.read_schema(out).metadata or {} - assert md.get(b"agent") == b"hermes" - assert md.get(b"model") == b"GLM-4.6" - - -def test_export_local_omits_metadata_when_agent_model_unset(tmp_path): - """Backwards-compat: when callers don't pass ``agent``/``model``, - we don't write empty markers — the parquet just has no KV metadata - and the picker falls back to ``?`` on read.""" - import pyarrow.parquet as pq - capture = tmp_path / "cap" - capture.mkdir() - _write_capture(capture, "ra", _BODY, {"choices": [{"index": 0}]}) - - out = tmp_path / "rows.parquet" - export_local(capture, out, progress=False) - md = pq.read_schema(out).metadata - # ``with_metadata({})`` is never called when both are None, so the - # schema carries no custom metadata at all. - assert md is None or (b"agent" not in md and b"model" not in md) diff --git a/tests/test_followups.py b/tests/test_followups.py deleted file mode 100644 index 7d3f599..0000000 --- a/tests/test_followups.py +++ /dev/null @@ -1,186 +0,0 @@ -"""Tests for the follow-up strategies.""" - -from __future__ import annotations - -import httpx -import pytest - -from agentcap.followups import get_followup -from agentcap.followups.continue_ import ContinueFollowUp -from agentcap.followups.synthesized import SynthesizedFollowUp -from agentcap.followups.synthesized import _default_call_synth -from agentcap.followups.templates import TemplatesFollowUp - - -def test_continue_followup_always_returns_continue(): - fu = ContinueFollowUp() - for turn in (2, 3, 100): - assert ( - fu.next(original_task="anything", last_response="resp", turn=turn) - == "continue" - ) - - -def test_continue_followup_custom_text(): - fu = ContinueFollowUp(text="more") - assert fu.next(original_task="t", last_response="r", turn=2) == "more" - - -def test_templates_followup_rotates_through_pool(): - fu = TemplatesFollowUp(pool=("a", "b", "c")) - seen = [ - fu.next(original_task="t", last_response="r", turn=t) - for t in (2, 3, 4, 5, 6) - ] - assert seen == ["a", "b", "c", "a", "b"] - - -def test_templates_followup_default_pool_is_nonempty(): - fu = TemplatesFollowUp() - out = fu.next(original_task="t", last_response="r", turn=2) - assert isinstance(out, str) and out - - -def test_templates_followup_rejects_empty_pool(): - with pytest.raises(ValueError): - TemplatesFollowUp(pool=()) - - -def test_synthesized_followup_calls_synth_with_prompt(): - captured: dict = {} - - def fake_call(*, upstream, model, prompt, timeout, api_key=None): - captured["upstream"] = upstream - captured["model"] = model - captured["prompt"] = prompt - captured["timeout"] = timeout - captured["api_key"] = api_key - return " Show me the migration plan. " - - fu = SynthesizedFollowUp( - upstream="http://synth:9000", - model="synth-model", - call=fake_call, - timeout=10, - ) - out = fu.next( - original_task="Plan the S3 backend.", - last_response="Here's a draft plan.", - turn=2, - ) - assert out == "Show me the migration plan." - assert captured["upstream"] == "http://synth:9000" - assert captured["model"] == "synth-model" - assert captured["timeout"] == 10 - # Prompt embeds task and response - assert "Plan the S3 backend." in captured["prompt"] - assert "Here's a draft plan." in captured["prompt"] - - -def test_synthesized_followup_falls_back_on_exception(capsys): - def boom(**_): - raise RuntimeError("synth down") - - fu = SynthesizedFollowUp( - upstream="http://synth", model="m", call=boom, fallback="continue" - ) - assert fu.next(original_task="t", last_response="r", turn=2) == "continue" - # Fallback must be noisy — silence here used to mask 401s against - # authenticated upstreams while run.json kept claiming - # ``followup: synthesized``. - err = capsys.readouterr().err - assert "synthesized turn=2 fell back to 'continue'" in err - assert "RuntimeError" in err and "synth down" in err - - -def test_synthesized_followup_falls_back_on_empty_response(): - fu = SynthesizedFollowUp( - upstream="http://synth", - model="m", - call=lambda **_: " ", - fallback="keep going", - ) - assert fu.next(original_task="t", last_response="r", turn=2) == "keep going" - - -def test_get_followup_dispatch(): - assert isinstance(get_followup("continue"), ContinueFollowUp) - assert isinstance(get_followup("templates"), TemplatesFollowUp) - # synthesized requires upstream/model kwargs - fu = get_followup( - "synthesized", upstream="http://x", model="m", call=lambda **_: "ok" - ) - assert isinstance(fu, SynthesizedFollowUp) - - -def test_get_followup_unknown(): - with pytest.raises(ValueError): - get_followup("not-a-strategy") - - -def test_default_call_synth_accepts_upstream_with_v1_suffix(monkeypatch): - class _Resp: - def raise_for_status(self): - return None - - def json(self): - return {"choices": [{"message": {"content": "ok"}}]} - - captured = {} - - def fake_post(url, json, timeout, headers=None): - captured["url"] = url - captured["headers"] = headers - return _Resp() - - monkeypatch.setattr(httpx, "post", fake_post) - - out = _default_call_synth( - upstream="https://router.huggingface.co/v1", - model="Qwen/Qwen3-8B", - prompt="p", - timeout=5, - ) - assert out == "ok" - assert captured["url"] == "https://router.huggingface.co/v1/chat/completions" - assert captured["headers"] is None # no api_key => no Authorization - - -def test_default_call_synth_sends_bearer_when_api_key_given(monkeypatch): - class _Resp: - def raise_for_status(self): - return None - - def json(self): - return {"choices": [{"message": {"content": "ok"}}]} - - captured = {} - - def fake_post(url, json, timeout, headers=None): - captured["headers"] = headers - return _Resp() - - monkeypatch.setattr(httpx, "post", fake_post) - - _default_call_synth( - upstream="https://router.huggingface.co", - model="m", - prompt="p", - timeout=5, - api_key="hf_xyz", - ) - assert captured["headers"] == {"Authorization": "Bearer hf_xyz"} - - -def test_synthesized_followup_passes_api_key_to_call(): - seen = {} - - def fake(*, upstream, model, prompt, timeout, api_key): - seen["api_key"] = api_key - return "next" - - fu = SynthesizedFollowUp( - upstream="http://synth", model="m", call=fake, api_key="hf_abc" - ) - fu.next(original_task="t", last_response="r", turn=2) - assert seen["api_key"] == "hf_abc" diff --git a/tests/test_inspect_helpers.py b/tests/test_inspect_helpers.py deleted file mode 100644 index c46ad07..0000000 --- a/tests/test_inspect_helpers.py +++ /dev/null @@ -1,158 +0,0 @@ -"""Unit tests for the inspect picker's parsing helpers. - -Covers ``_decode_sse_response`` (OpenAI-compatible SSE → synthesized -assistant message) and ``_parse_fzf_terms`` (fzf query → list of -substrings to highlight). These functions are pure and live behind the -interactive picker, so they're easy to drift on without notice. -""" - -from __future__ import annotations - -import json - -from agentcap.__main__ import _decode_sse_response -from agentcap.__main__ import _parse_fzf_terms - - -def _sse(*objs) -> str: - """Assemble an SSE blob: one ``data: `` line per object, - plus a trailing ``data: [DONE]`` marker like real servers send.""" - return ( - "\n".join(f"data: {json.dumps(o)}" for o in objs) - + "\ndata: [DONE]\n" - ) - - -def test_decode_sse_empty_returns_empty_message(): - out = _decode_sse_response("") - assert out == {"content": "", "tool_calls": [], "finish_reason": None} - - -def test_decode_sse_concatenates_content_chunks(): - raw = _sse( - {"choices": [{"delta": {"content": "Hello"}}]}, - {"choices": [{"delta": {"content": ", "}}]}, - {"choices": [{"delta": {"content": "world!"}}]}, - {"choices": [{"delta": {}, "finish_reason": "stop"}]}, - ) - out = _decode_sse_response(raw) - assert out["content"] == "Hello, world!" - assert out["tool_calls"] == [] - assert out["finish_reason"] == "stop" - - -def test_decode_sse_merges_tool_call_argument_fragments(): - # First chunk for a tool call carries id + function.name; later - # chunks accumulate ``arguments`` fragments under the same index. - raw = _sse( - {"choices": [{"delta": {"tool_calls": [{ - "index": 0, "id": "call_1", "type": "function", - "function": {"name": "read", "arguments": ""}, - }]}}]}, - {"choices": [{"delta": {"tool_calls": [{ - "index": 0, "function": {"arguments": '{"path"'}, - }]}}]}, - {"choices": [{"delta": {"tool_calls": [{ - "index": 0, "function": {"arguments": ': "a.py"}'}, - }]}}]}, - {"choices": [{"delta": {}, "finish_reason": "tool_calls"}]}, - ) - out = _decode_sse_response(raw) - assert out["content"] == "" - assert out["tool_calls"] == [{ - "id": "call_1", "type": "function", - "function": {"name": "read", "arguments": '{"path": "a.py"}'}, - }] - assert out["finish_reason"] == "tool_calls" - - -def test_decode_sse_keeps_multiple_tool_calls_in_index_order(): - # Two parallel tool calls — index 1's first chunk arrives before - # index 0's last; the decoder must still emit them sorted by index. - raw = _sse( - {"choices": [{"delta": {"tool_calls": [{ - "index": 0, "id": "c0", - "function": {"name": "first", "arguments": "{"}, - }]}}]}, - {"choices": [{"delta": {"tool_calls": [{ - "index": 1, "id": "c1", - "function": {"name": "second", "arguments": "{}"}, - }]}}]}, - {"choices": [{"delta": {"tool_calls": [{ - "index": 0, "function": {"arguments": "}"}, - }]}}]}, - ) - out = _decode_sse_response(raw) - names = [tc["function"]["name"] for tc in out["tool_calls"]] - ids = [tc["id"] for tc in out["tool_calls"]] - args = [tc["function"]["arguments"] for tc in out["tool_calls"]] - assert names == ["first", "second"] - assert ids == ["c0", "c1"] - assert args == ["{}", "{}"] - - -def test_decode_sse_skips_malformed_json_lines(): - # A garbled chunk in the middle must not abort the whole stream. - raw = ( - 'data: {"choices":[{"delta":{"content":"ok"}}]}\n' - "data: {not json\n" - 'data: {"choices":[{"delta":{"content":"!"}}]}\n' - "data: [DONE]\n" - ) - out = _decode_sse_response(raw) - assert out["content"] == "ok!" - - -def test_decode_sse_ignores_non_data_and_blank_lines(): - # Real streams interleave keep-alive comments (``: ping``) and - # blank separators between events. - raw = ( - ": keepalive\n" - "\n" - 'data: {"choices":[{"delta":{"content":"x"}}]}\n' - "\n" - "event: end\n" - 'data: {"choices":[{"delta":{},"finish_reason":"stop"}]}\n' - "data: [DONE]\n" - ) - out = _decode_sse_response(raw) - assert out["content"] == "x" - assert out["finish_reason"] == "stop" - - -def test_parse_fzf_terms_empty_query_returns_empty_list(): - assert _parse_fzf_terms("") == [] - assert _parse_fzf_terms(" ") == [] - - -def test_parse_fzf_terms_plain_words(): - assert _parse_fzf_terms("alpha beta") == ["alpha", "beta"] - - -def test_parse_fzf_terms_strips_exact_match_quote(): - # ``'word`` → exact-match in fzf; the leading quote is a fzf - # operator, not part of the substring to highlight. - assert _parse_fzf_terms("'hf-cli") == ["hf-cli"] - - -def test_parse_fzf_terms_strips_anchors(): - # ``^`` (prefix) and ``$`` (suffix) are fzf anchors — neither is - # part of the substring being matched. - assert _parse_fzf_terms("^foo") == ["foo"] - assert _parse_fzf_terms("bar$") == ["bar"] - assert _parse_fzf_terms("^baz$") == ["baz"] - - -def test_parse_fzf_terms_drops_negated_terms(): - # ``!word`` excludes matches in fzf — nothing to colour for it. - assert _parse_fzf_terms("keep !drop also") == ["keep", "also"] - - -def test_parse_fzf_terms_drops_bare_or_separator(): - # A bare ``|`` between two terms is fzf's OR — not a substring. - assert _parse_fzf_terms("a | b") == ["a", "b"] - - -def test_parse_fzf_terms_handles_mixed_operators(): - out = _parse_fzf_terms("'exact ^prefix suffix$ !nope plain") - assert out == ["exact", "prefix", "suffix", "plain"] diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py deleted file mode 100644 index f043a34..0000000 --- a/tests/test_orchestrator.py +++ /dev/null @@ -1,292 +0,0 @@ -"""Tests for the orchestrator.""" - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from agentcap.drivers import AgentDriver, AgentTurn -from agentcap.followups.continue_ import ContinueFollowUp -from agentcap.followups.templates import TemplatesFollowUp -from agentcap.orchestrator import Orchestrator, read_tasks_txt - - -# --------------------------------------------------------------------------- -# Fake driver -# --------------------------------------------------------------------------- - - -class FakeDriver(AgentDriver): - """Records every call; returns scripted AgentTurns.""" - - name = "fake" - - def __init__( - self, - *, - start_turn: AgentTurn | None = None, - resume_turn: AgentTurn | None = None, - resume_unsupported: bool = False, - ) -> None: - self.calls: list[tuple[str, str, str | None]] = [] # (op, prompt, sid) - self._start_turn = start_turn or AgentTurn( - session_id="ses_fake", response_text="initial response", returncode=0, - stdout="hi", stderr="", - ) - self._resume_turn = resume_turn or AgentTurn( - session_id="ses_fake", response_text="continuation", returncode=0, - stdout="ok", stderr="", - ) - self._resume_unsupported = resume_unsupported - - def start(self, prompt, *, env=None, timeout=None): - self.calls.append(("start", prompt, None)) - return self._start_turn - - def resume(self, prompt, *, session_id, env=None, timeout=None): - self.calls.append(("resume", prompt, session_id)) - if self._resume_unsupported: - raise NotImplementedError("fake doesn't resume") - return self._resume_turn - - -# --------------------------------------------------------------------------- -# read_tasks_txt -# --------------------------------------------------------------------------- - - -def test_read_tasks_skips_comments_and_blanks(tmp_path: Path): - p = tmp_path / "tasks.txt" - p.write_text( - "# header comment\n" - "first task\n" - "\n" - " # indented comment\n" - "second task\n" - " third task with leading space\n" - ) - tasks = read_tasks_txt(p) - assert tasks == ["first task", "second task", "third task with leading space"] - - -def test_read_tasks_empty_file(tmp_path: Path): - p = tmp_path / "tasks.txt" - p.write_text("# only comments\n\n") - assert read_tasks_txt(p) == [] - - -# --------------------------------------------------------------------------- -# Orchestrator.run_task -# --------------------------------------------------------------------------- - - -def test_run_task_single_turn_no_followup_call(): - drv = FakeDriver() - fu = ContinueFollowUp() - orch = Orchestrator(drv, fu) - - result = orch.run_task("Plan the S3 backend", task_id="t01", turns=1) - assert len(result.turns) == 1 - assert result.turns[0].turn == 1 - assert result.turns[0].prompt == "Plan the S3 backend" - assert result.session_id == "ses_fake" - # Driver was called once for start, never for resume - assert [c[0] for c in drv.calls] == ["start"] - - -def test_run_task_multi_turn_uses_continue_followup(): - drv = FakeDriver() - orch = Orchestrator(drv, ContinueFollowUp()) - result = orch.run_task("task", task_id="t01", turns=4) - assert len(result.turns) == 4 - assert [c[0] for c in drv.calls] == ["start", "resume", "resume", "resume"] - # All resume prompts are "continue" - for op, prompt, sid in drv.calls[1:]: - assert prompt == "continue" - assert sid == "ses_fake" - - -def test_run_task_multi_turn_uses_templates_pool(): - drv = FakeDriver() - pool = ("first", "second", "third") - orch = Orchestrator(drv, TemplatesFollowUp(pool=pool)) - orch.run_task("task", task_id="t01", turns=4) - # Skip the start call; resume prompts cycle through pool - resume_prompts = [p for op, p, _ in drv.calls if op == "resume"] - assert resume_prompts == list(pool) - - -def test_run_task_aborts_when_initial_returncode_nonzero(): - drv = FakeDriver( - start_turn=AgentTurn( - session_id=None, response_text="", returncode=1, stdout="", stderr="boom" - ) - ) - orch = Orchestrator(drv, ContinueFollowUp()) - result = orch.run_task("task", task_id="t01", turns=3) - assert len(result.turns) == 1 - assert result.completed_turns == 0 - # No resume calls were made - assert all(c[0] == "start" for c in drv.calls) - - -def test_run_task_aborts_when_no_session_id_for_multi_turn(): - drv = FakeDriver( - start_turn=AgentTurn( - session_id=None, response_text="hi", returncode=0, stdout="", stderr="" - ) - ) - orch = Orchestrator(drv, ContinueFollowUp()) - result = orch.run_task("task", task_id="t01", turns=3) - assert len(result.turns) == 1 - # Only the start call; resume never happens because session_id is None - assert all(c[0] == "start" for c in drv.calls) - - -def test_run_task_breaks_loop_on_resume_failure(): - drv = FakeDriver( - resume_turn=AgentTurn( - session_id="ses_fake", response_text="", returncode=124, - stdout="", stderr="timeout", - ) - ) - orch = Orchestrator(drv, ContinueFollowUp()) - result = orch.run_task("task", task_id="t01", turns=4) - # One success + one failure, then the loop breaks - assert len(result.turns) == 2 - assert result.turns[1].returncode == 124 - - -def test_run_task_handles_resume_not_implemented(): - drv = FakeDriver(resume_unsupported=True) - orch = Orchestrator(drv, ContinueFollowUp()) - result = orch.run_task("task", task_id="t01", turns=3) - # First turn succeeds; resume raises NotImplementedError; orchestrator stops - assert len(result.turns) == 1 - - -def test_run_task_rejects_zero_turns(): - drv = FakeDriver() - orch = Orchestrator(drv, ContinueFollowUp()) - with pytest.raises(ValueError): - orch.run_task("t", task_id="x", turns=0) - - -def test_run_task_writes_session_logs_when_sessions_dir_set(tmp_path: Path): - drv = FakeDriver( - start_turn=AgentTurn( - session_id="s1", response_text="r", returncode=0, - stdout="STDOUT-init", stderr="STDERR-init", - ), - resume_turn=AgentTurn( - session_id="s1", response_text="r2", returncode=0, - stdout="STDOUT-cont", stderr="STDERR-cont", - ), - ) - sessions = tmp_path / "sessions" - orch = Orchestrator(drv, ContinueFollowUp(), sessions_dir=sessions) - orch.run_task("t", task_id="task_01", turns=2) - - assert (sessions / "task_01_turn_01.out").read_text() == "STDOUT-init" - assert (sessions / "task_01_turn_01.err").read_text() == "STDERR-init" - assert (sessions / "task_01_turn_02.out").read_text() == "STDOUT-cont" - assert (sessions / "task_01_turn_02.err").read_text() == "STDERR-cont" - - -# --------------------------------------------------------------------------- -# Orchestrator.run_corpus -# --------------------------------------------------------------------------- - - -def test_run_corpus_iterates_tasks_with_default_id_format(): - drv = FakeDriver() - orch = Orchestrator(drv, ContinueFollowUp()) - results = orch.run_corpus( - ["task A", "task B", "task C"], turns_per_task=1 - ) - assert [r.task_id for r in results] == ["task_01", "task_02", "task_03"] - assert [r.prompt for r in results] == ["task A", "task B", "task C"] - - -def test_run_corpus_records_events(): - drv = FakeDriver() - events: list[tuple[str, dict]] = [] - - def listener(event: str, **kw): - events.append((event, kw)) - - orch = Orchestrator(drv, ContinueFollowUp(), on_event=listener) - orch.run_corpus(["task A"], turns_per_task=2) - event_names = [e for e, _ in events] - assert event_names[0] == "task_start" - assert event_names.count("turn_done") == 2 - - -def _timeout_after_n(n: int): - """Return a driver whose start/resume raises TimeoutExpired on the - n-th call (1-indexed), succeeds otherwise.""" - import subprocess - - class TimeoutDriver(FakeDriver): - def __init__(self): - super().__init__() - self._n = 0 - - def start(self, prompt, *, env=None, timeout=None): - self._n += 1 - if self._n == n: - raise subprocess.TimeoutExpired(["fake"], timeout or 1) - return super().start(prompt, env=env, timeout=timeout) - - def resume(self, prompt, *, session_id, env=None, timeout=None): - self._n += 1 - if self._n == n: - raise subprocess.TimeoutExpired(["fake"], timeout or 1) - return super().resume(prompt, session_id=session_id, env=env, timeout=timeout) - - return TimeoutDriver() - - -def test_run_task_aborts_on_initial_turn_timeout(): - """A driver timeout on turn 1 must not propagate; the task is - aborted with a recorded event and ``run_corpus`` keeps going.""" - drv = _timeout_after_n(1) - events: list[tuple[str, dict]] = [] - orch = Orchestrator( - drv, ContinueFollowUp(), on_event=lambda **kw: events.append((kw.pop("event"), kw)) - ) - result = orch.run_task("anything", task_id="t01", turns=2) - assert result.turns == [] - aborted = [e for e in events if e[0] == "task_aborted"] - assert aborted and aborted[0][1]["reason"] == "initial-turn-timeout" - - -def test_run_corpus_keeps_going_when_one_task_times_out(): - """Critical: a timeout on task 1 must not kill tasks 2+.""" - # Total calls across the run: t1 start (timeout), t2 start (ok), - # t3 start (ok). Trip the 1st call only. - drv = _timeout_after_n(1) - orch = Orchestrator(drv, ContinueFollowUp()) - results = orch.run_corpus( - ["task A", "task B", "task C"], turns_per_task=1 - ) - assert len(results) == 3 - # task A failed before any turn could be recorded - assert results[0].turns == [] - # tasks B and C completed turn 1 - assert len(results[1].turns) == 1 - assert len(results[2].turns) == 1 - - -def test_run_task_aborts_on_followup_turn_timeout(): - drv = _timeout_after_n(2) # 1st call ok (start), 2nd (resume) times out - events: list[tuple[str, dict]] = [] - orch = Orchestrator( - drv, ContinueFollowUp(), on_event=lambda **kw: events.append((kw.pop("event"), kw)) - ) - result = orch.run_task("anything", task_id="t01", turns=3) - # Only turn 1 recorded. - assert len(result.turns) == 1 - aborted = [e for e in events if e[0] == "task_aborted"] - assert aborted and aborted[0][1]["reason"] == "follow-up-turn-timeout" diff --git a/tests/test_podman_sandbox.py b/tests/test_podman_sandbox.py deleted file mode 100644 index 6a67fc1..0000000 --- a/tests/test_podman_sandbox.py +++ /dev/null @@ -1,229 +0,0 @@ -"""Structural tests for :mod:`agentcap.sandbox.podman`. - -Argv-assembly only — these don't shell out to podman. End-to-end -coverage against a real ``podman run`` lives in -``tests/test_drivers_live.py`` via the live driver tests. -""" - -from __future__ import annotations - -from agentcap.sandbox import Sandbox -from agentcap.sandbox.podman import PodmanSandbox, build_command - - -def test_podman_sandbox_implements_protocol(): - assert isinstance(PodmanSandbox(image="agentcap-goose:latest"), Sandbox) - - -def test_build_command_minimal(): - cmd = build_command( - ["echo", "hi"], - image="img:latest", - writable_paths=[], - ) - assert cmd[:3] == ["podman", "run", "--rm"] - assert cmd[-3:] == ["img:latest", "echo", "hi"] - - -def test_build_command_writable_bind_mount(tmp_path): - cmd = build_command( - ["true"], - image="img:latest", - writable_paths=[tmp_path], - ) - expected = f"type=bind,src={tmp_path.resolve()},dst={tmp_path.resolve()}" - assert "--mount" in cmd - assert expected in cmd - - -def test_build_command_readonly_bind_mount(tmp_path): - cmd = build_command( - ["true"], - image="img:latest", - writable_paths=[], - readonly_paths=[tmp_path], - ) - expected = ( - f"type=bind,src={tmp_path.resolve()},dst={tmp_path.resolve()},ro" - ) - assert expected in cmd - - -def test_build_command_deny_network(): - cmd = build_command( - ["true"], image="img:latest", writable_paths=[], - deny_network=True, - ) - assert "--network=none" in cmd - - -def test_build_command_propagates_env(): - cmd = build_command( - ["true"], image="img:latest", writable_paths=[], - env={"FOO": "bar"}, - ) - assert "-e" in cmd - assert "FOO=bar" in cmd - - -def test_build_command_propagates_cwd(tmp_path): - cmd = build_command( - ["true"], image="img:latest", writable_paths=[], - cwd=str(tmp_path), - ) - assert "--workdir" in cmd - assert str(tmp_path) in cmd - # ``cwd`` is also added to the writable bind set so chdir - # resolves inside the container. - expected = f"type=bind,src={tmp_path.resolve()},dst={tmp_path.resolve()}" - assert expected in cmd - - -def test_build_command_dedups_overlapping_mounts(tmp_path): - cmd = build_command( - ["true"], image="img:latest", - writable_paths=[tmp_path, tmp_path], - readonly_paths=[tmp_path], - ) - mount_args = [a for a in cmd if a.startswith("type=bind,")] - assert len(mount_args) == 1 - - -def test_wrap_layers_constructor_env_under_call_env(tmp_path): - sb = PodmanSandbox(image="img:latest", env={"A": "1", "B": "2"}) - cmd = sb.wrap(["true"], writable_paths=[], env={"B": "override"}) - assert "A=1" in cmd - assert "B=override" in cmd - assert "B=2" not in cmd - - -def test_wrap_combines_lifetime_and_per_call_writable_paths(tmp_path): - lifetime = tmp_path / "lifetime" - lifetime.mkdir() - per_call = tmp_path / "percall" - per_call.mkdir() - sb = PodmanSandbox(image="img:latest", writable_paths=[lifetime]) - cmd = sb.wrap(["true"], writable_paths=[per_call]) - assert f"type=bind,src={lifetime.resolve()},dst={lifetime.resolve()}" in cmd - assert f"type=bind,src={per_call.resolve()},dst={per_call.resolve()}" in cmd - - -def test_close_is_noop(): - sb = PodmanSandbox(image="img:latest") - sb.close() - sb.close() - - -def test_context_manager_closes(): - with PodmanSandbox(image="img:latest") as sb: - assert sb.image == "img:latest" - - -def test_run_names_container_and_force_removes_it(monkeypatch): - """Every ``run()`` must inject ``--name agentcap-`` and, in a - ``finally`` block, fire ``podman rm -f `` even when the - main subprocess succeeded — ``--rm`` only fires on a clean container - exit, so this is the guarantee against orphaned containers when - timeouts/kills/dead parents prevent that.""" - import subprocess - from agentcap.sandbox import podman as pmod - - calls: list[list[str]] = [] - - class _Completed: - returncode = 0 - stdout = "" - stderr = "" - - def fake_run(argv, **_kw): - calls.append(list(argv)) - return _Completed() - - monkeypatch.setattr(subprocess, "run", fake_run) - sb = pmod.PodmanSandbox(image="img:latest") - sb.run(["echo", "hi"]) - - assert len(calls) == 2, f"expected run + rm; got {calls!r}" - run_cmd, rm_cmd = calls - # ``--name `` was inserted right after ``podman run``. - assert "--name" in run_cmd - name_idx = run_cmd.index("--name") - name = run_cmd[name_idx + 1] - assert name.startswith("agentcap-") - # The cleanup targets the same name. - assert rm_cmd[:3] == ["podman", "rm", "-f"] - assert rm_cmd[3] == name - - -def test_run_force_removes_container_even_if_subprocess_raises(monkeypatch): - """When ``subprocess.run`` raises (e.g. ``TimeoutExpired``), the - container can still be alive — the cleanup ``podman rm -f`` must - fire from the ``finally`` so the orchestrator never leaks a - container even on timeout / SIGINT. The cleanup must also target - the SAME name that ``podman run`` was given; removing the wrong - container would silently nuke something else.""" - import subprocess - from agentcap.sandbox import podman as pmod - - run_calls: list[list[str]] = [] - rm_calls: list[list[str]] = [] - - def fake_run(argv, **kw): - if argv[:3] == ["podman", "rm", "-f"]: - rm_calls.append(list(argv)) - class _R: - returncode = 0 - stdout = "" - stderr = "" - return _R() - run_calls.append(list(argv)) - raise subprocess.TimeoutExpired(cmd=argv, timeout=kw.get("timeout")) - - monkeypatch.setattr(subprocess, "run", fake_run) - sb = pmod.PodmanSandbox(image="img:latest") - try: - sb.run(["sleep", "60"], timeout=0.01) - except subprocess.TimeoutExpired: - pass - else: - raise AssertionError("expected TimeoutExpired to propagate") - - # Extract the name passed to ``podman run`` via ``--name ``. - assert len(run_calls) == 1, run_calls - run_argv = run_calls[0] - assert "--name" in run_argv - run_name = run_argv[run_argv.index("--name") + 1] - assert run_name.startswith("agentcap-") - - # Cleanup must have targeted that exact name — not some other - # container, and not no container. - assert len(rm_calls) == 1, rm_calls - assert rm_calls[0] == ["podman", "rm", "-f", run_name] - - -def test_run_propagates_main_failure_when_cleanup_also_fails(monkeypatch): - """If both the main ``podman run`` and the cleanup ``podman rm -f`` - raise, callers must see the ORIGINAL exception — not the cleanup's. - Otherwise a transient ``rm`` failure would mask the real reason the - container run failed (timeout, exit code, etc.).""" - import subprocess - from agentcap.sandbox import podman as pmod - - def fake_run(argv, **kw): - if argv[:3] == ["podman", "rm", "-f"]: - raise RuntimeError("cleanup boom") - raise subprocess.TimeoutExpired(cmd=argv, timeout=kw.get("timeout")) - - monkeypatch.setattr(subprocess, "run", fake_run) - sb = pmod.PodmanSandbox(image="img:latest") - try: - sb.run(["sleep", "60"], timeout=0.01) - except subprocess.TimeoutExpired: - pass # original exception preserved - except RuntimeError as exc: - raise AssertionError( - f"cleanup exception ({exc}) leaked past the finally — " - f"primary TimeoutExpired was masked" - ) - else: - raise AssertionError("expected TimeoutExpired to propagate") diff --git a/tests/test_provider.py b/tests/test_provider.py deleted file mode 100644 index 1e02bfb..0000000 --- a/tests/test_provider.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Pure-Python tests for ``agentcap.provider`` — classifier + hostname -fallback + HF Router sub-provider refinement. The actual network probe -is tested implicitly via the live integration suite; here we feed -synthetic ``endpoints`` dicts to exercise the classification logic.""" - -from __future__ import annotations - -from agentcap.provider import ( - _classify, - _hostname_fallback, - refine_for_sub_provider, -) - - -# --------------------------------------------------------------------------- -# hostname fallback -# --------------------------------------------------------------------------- - - -def test_hostname_fallback_known_providers(): - assert _hostname_fallback("https://router.huggingface.co/v1") == "hf-router" - assert _hostname_fallback("https://api.openai.com/v1") == "openai" - assert _hostname_fallback("https://api.together.xyz/v1") == "together" - assert _hostname_fallback("https://api.fireworks.ai/v1") == "fireworks" - - -def test_hostname_fallback_loopback_and_private(): - assert _hostname_fallback("http://127.0.0.1:8000/v1") == "local" - assert _hostname_fallback("http://localhost:8000/v1") == "local" - assert _hostname_fallback("http://10.0.0.5:8000/v1") == "local" - assert _hostname_fallback("http://192.168.1.42:8000/v1") == "local" - - -def test_hostname_fallback_unknown_public(): - # eTLD+1-style: api.mycompany.com → "mycompany" - assert _hostname_fallback("https://api.mycompany.com/v1") == "mycompany" - - -# --------------------------------------------------------------------------- -# classifier -# --------------------------------------------------------------------------- - - -def test_classify_hf_router_via_colon_suffix(): - endpoints = { - "models": {"body": {"data": [ - {"id": "meta-llama/Llama-3.3-70B-Instruct"}, - {"id": "meta-llama/Llama-3.3-70B-Instruct:fireworks-ai"}, - ]}}, - } - assert _classify(endpoints, "https://router.huggingface.co/v1") == "hf-router" - - -def test_classify_llama_cpp_via_props(): - endpoints = { - "props": {"body": {"chat_template": "...", "n_ctx": 65536}}, - "models": {"body": {"data": [{"id": "qwen-test"}]}}, - } - assert _classify(endpoints, "http://127.0.0.1:8000/v1") == "local-llama-server" - - -def test_classify_tgi_via_info_model_id(): - endpoints = { - "info": {"body": {"model_id": "meta-llama/Llama-3.3-70B-Instruct", - "version": "2.4.1"}}, - } - assert _classify(endpoints, "http://10.0.0.5:8000/v1") == "tgi" - - -def test_classify_vllm_via_version(): - endpoints = { - "version": {"body": {"version": "0.7.0"}}, - "models": {"body": {"data": [{"id": "served-model"}]}}, - } - assert _classify(endpoints, "http://10.0.0.5:8000/v1") == "vllm" - - -def test_classify_openai_via_model_ids(): - endpoints = { - "models": {"body": {"data": [ - {"id": "gpt-4o-mini"}, - {"id": "o1-preview"}, - ]}}, - } - assert _classify(endpoints, "https://api.openai.com/v1") == "openai" - - -def test_classify_falls_back_to_hostname_when_probe_empty(): - assert _classify({}, "https://router.huggingface.co/v1") == "hf-router" - assert _classify({}, "http://127.0.0.1:8000/v1") == "local" - - -# --------------------------------------------------------------------------- -# refine_for_sub_provider -# --------------------------------------------------------------------------- - - -def test_refine_pins_hf_router_sub_provider(): - assert refine_for_sub_provider( - "hf-router", "meta-llama/Llama-3.3-70B-Instruct:fireworks-ai" - ) == "hf-router/fireworks-ai" - - -def test_refine_noop_without_colon_or_non_hf_router(): - assert refine_for_sub_provider("hf-router", "meta-llama/Llama-3.3-70B") == "hf-router" - assert refine_for_sub_provider("local", "anything:fireworks-ai") == "local" - - diff --git a/tests/test_proxy.py b/tests/test_proxy.py deleted file mode 100644 index bca9ca7..0000000 --- a/tests/test_proxy.py +++ /dev/null @@ -1,327 +0,0 @@ -"""Tests for the capture proxy. - -Strategy: stand up a mock upstream Starlette app and wire the proxy's -internal httpx client to it via ``ASGITransport``. Then drive the -proxy via Starlette's ``TestClient`` and assert on (a) what bytes the -agent-side client sees, and (b) what files land on disk in the capture -dir. - -End-to-end network sockets are not used — everything runs in-process. -""" - -from __future__ import annotations - -import json -from pathlib import Path - -import httpx -import pytest -from starlette.applications import Starlette -from starlette.requests import Request -from starlette.responses import JSONResponse, Response, StreamingResponse -from starlette.routing import Route -from starlette.testclient import TestClient - -from agentcap.proxy import CHAT_COMPLETIONS_PATH, make_app - - -# --------------------------------------------------------------------------- -# Mock upstream — a tiny Starlette app that pretends to be an OpenAI-compat -# model server. Each test parameterises its behaviour by setting attributes -# on the wrapping ``Holder``. -# --------------------------------------------------------------------------- - - -class UpstreamSpy: - """Records what the proxy forwarded to upstream + lets each test - plug in a custom response factory.""" - - def __init__(self) -> None: - self.received_bodies: list[dict] = [] - self.received_headers: list[dict] = [] - self.received_paths: list[str] = [] - self.responder = None # async callable: (request) -> Response - - def set_responder(self, fn) -> None: - self.responder = fn - - -def _build_upstream(spy: UpstreamSpy) -> Starlette: - async def chat_handler(request: Request) -> Response: - body = await request.body() - try: - spy.received_bodies.append(json.loads(body)) - except json.JSONDecodeError: - spy.received_bodies.append({"_unparsed": body.decode("utf-8", errors="replace")}) - spy.received_headers.append(dict(request.headers)) - spy.received_paths.append(request.url.path) - if spy.responder is None: - return JSONResponse({"error": "no responder configured"}, status_code=500) - return await spy.responder(request) - - async def models_handler(request: Request) -> Response: - spy.received_paths.append(request.url.path) - return JSONResponse( - {"object": "list", "data": [{"id": "mock-model", "object": "model"}]} - ) - - async def echo_handler(request: Request) -> Response: - spy.received_paths.append(request.url.path) - return JSONResponse({"path": request.url.path, "method": request.method}) - - return Starlette( - routes=[ - Route(CHAT_COMPLETIONS_PATH, chat_handler, methods=["POST"]), - Route("/v1/models", models_handler, methods=["GET"]), - Route( - "/{anything:path}", - echo_handler, - methods=["GET", "POST", "PUT", "DELETE"], - ), - ] - ) - - -@pytest.fixture -def spy() -> UpstreamSpy: - return UpstreamSpy() - - -@pytest.fixture -def capture_dir(tmp_path: Path) -> Path: - d = tmp_path / "capture" - d.mkdir() - return d - - -@pytest.fixture -def proxy_client(spy: UpstreamSpy, capture_dir: Path): - """A TestClient hitting the proxy, where the proxy talks to the - mock upstream via ASGITransport.""" - upstream_app = _build_upstream(spy) - upstream_transport = httpx.ASGITransport(app=upstream_app) - upstream_client = httpx.AsyncClient( - transport=upstream_transport, base_url="http://upstream" - ) - proxy_app = make_app("http://upstream", capture_dir, client=upstream_client) - with TestClient(proxy_app) as client: - yield client - - -# --------------------------------------------------------------------------- -# Tests -# --------------------------------------------------------------------------- - - -def test_chat_nonstreaming_captures_request_and_response( - spy: UpstreamSpy, capture_dir: Path, proxy_client: TestClient -): - async def responder(request): - return JSONResponse( - { - "id": "chatcmpl-test", - "object": "chat.completion", - "choices": [ - { - "index": 0, - "message": {"role": "assistant", "content": "hi back"}, - "finish_reason": "stop", - } - ], - } - ) - - spy.set_responder(responder) - body = { - "model": "test-model", - "messages": [{"role": "user", "content": "hello"}], - "stream": False, - } - resp = proxy_client.post(CHAT_COMPLETIONS_PATH, json=body) - assert resp.status_code == 200 - assert resp.json()["choices"][0]["message"]["content"] == "hi back" - - # Upstream saw the same body - assert spy.received_bodies == [body] - assert spy.received_paths == [CHAT_COMPLETIONS_PATH] - - # Trace dir has exactly one request + response pair - req_files = sorted(capture_dir.glob("*.request.json")) - resp_files = sorted(capture_dir.glob("*.response.json")) - assert len(req_files) == 1 - assert len(resp_files) == 1 - assert req_files[0].stem.split(".")[0] == resp_files[0].stem.split(".")[0] - - req_record = json.loads(req_files[0].read_text()) - assert req_record["body"] == body - assert "request_id" in req_record - assert isinstance(req_record["captured_at"], int) - - resp_record = json.loads(resp_files[0].read_text()) - assert resp_record["stream"] is False - assert resp_record["status_code"] == 200 - assert resp_record["body"]["choices"][0]["message"]["content"] == "hi back" - - -def test_chat_streaming_forwards_chunks_and_captures_raw( - spy: UpstreamSpy, capture_dir: Path, proxy_client: TestClient -): - sse_chunks = [ - b'data: {"choices":[{"delta":{"role":"assistant"}}]}\n\n', - b'data: {"choices":[{"delta":{"content":"hi"}}]}\n\n', - b'data: {"choices":[{"delta":{"content":" back"}}]}\n\n', - b"data: [DONE]\n\n", - ] - - async def responder(request): - async def gen(): - for c in sse_chunks: - yield c - - return StreamingResponse(gen(), media_type="text/event-stream") - - spy.set_responder(responder) - - body = { - "model": "test-model", - "messages": [{"role": "user", "content": "hi"}], - "stream": True, - } - with proxy_client.stream("POST", CHAT_COMPLETIONS_PATH, json=body) as resp: - assert resp.status_code == 200 - received = b"".join(resp.iter_bytes()) - - # The agent-side client got the bytes the upstream produced - assert received == b"".join(sse_chunks) - - # The capture's response.json captured the assembled stream + status - resp_files = list(capture_dir.glob("*.response.json")) - assert len(resp_files) == 1 - record = json.loads(resp_files[0].read_text()) - assert record["stream"] is True - assert record["status_code"] == 200 - assert record["raw"] == b"".join(sse_chunks).decode("utf-8") - - -def test_passthrough_models_endpoint_is_not_captured( - spy: UpstreamSpy, capture_dir: Path, proxy_client: TestClient -): - resp = proxy_client.get("/v1/models") - assert resp.status_code == 200 - assert resp.json() == { - "object": "list", - "data": [{"id": "mock-model", "object": "model"}], - } - # Upstream saw the call - assert "/v1/models" in spy.received_paths - # But nothing was written to the capture dir - assert list(capture_dir.iterdir()) == [] - - -def test_arbitrary_passthrough_path( - spy: UpstreamSpy, capture_dir: Path, proxy_client: TestClient -): - resp = proxy_client.get("/unrelated/path?x=1") - assert resp.status_code == 200 - body = resp.json() - assert body["path"] == "/unrelated/path" - assert body["method"] == "GET" - # Trace dir untouched - assert list(capture_dir.iterdir()) == [] - - -def test_two_requests_get_distinct_request_ids( - spy: UpstreamSpy, capture_dir: Path, proxy_client: TestClient -): - async def responder(request): - return JSONResponse({"id": "x", "choices": []}) - - spy.set_responder(responder) - - body = {"model": "m", "messages": [{"role": "user", "content": "."}]} - proxy_client.post(CHAT_COMPLETIONS_PATH, json=body) - proxy_client.post(CHAT_COMPLETIONS_PATH, json=body) - - req_files = sorted(capture_dir.glob("*.request.json")) - assert len(req_files) == 2 - ids = {json.loads(p.read_text())["request_id"] for p in req_files} - assert len(ids) == 2 # distinct - - -def test_malformed_request_body_still_captured( - spy: UpstreamSpy, capture_dir: Path, proxy_client: TestClient -): - async def responder(request): - return JSONResponse({"choices": []}) - - spy.set_responder(responder) - - raw = b"{not json" - resp = proxy_client.post( - CHAT_COMPLETIONS_PATH, content=raw, headers={"content-type": "application/json"} - ) - # Upstream still got the bytes verbatim — we don't sanitise input. - # Whether upstream accepts it is upstream's problem; we just relay. - assert resp.status_code == 200 - - req_files = list(capture_dir.glob("*.request.json")) - assert len(req_files) == 1 - record = json.loads(req_files[0].read_text()) - # Body is preserved as a placeholder dict instead of crashing - assert record["body"] == {"_unparsed_raw": "{not json"} - - -def test_upstream_500_is_forwarded_and_captured( - spy: UpstreamSpy, capture_dir: Path, proxy_client: TestClient -): - async def responder(request): - return JSONResponse({"error": {"message": "boom"}}, status_code=500) - - spy.set_responder(responder) - - body = {"model": "m", "messages": [{"role": "user", "content": "x"}]} - resp = proxy_client.post(CHAT_COMPLETIONS_PATH, json=body) - assert resp.status_code == 500 - assert resp.json()["error"]["message"] == "boom" - - resp_files = list(capture_dir.glob("*.response.json")) - assert len(resp_files) == 1 - record = json.loads(resp_files[0].read_text()) - assert record["status_code"] == 500 - assert record["body"]["error"]["message"] == "boom" - - -def test_request_id_is_consistent_across_request_and_response_files( - spy: UpstreamSpy, capture_dir: Path, proxy_client: TestClient -): - async def responder(request): - return JSONResponse({"choices": []}) - - spy.set_responder(responder) - - proxy_client.post( - CHAT_COMPLETIONS_PATH, - json={"model": "m", "messages": [{"role": "user", "content": "."}]}, - ) - req_files = list(capture_dir.glob("*.request.json")) - resp_files = list(capture_dir.glob("*.response.json")) - assert len(req_files) == 1 - assert len(resp_files) == 1 - rid_from_req = json.loads(req_files[0].read_text())["request_id"] - rid_from_resp = json.loads(resp_files[0].read_text())["request_id"] - assert rid_from_req == rid_from_resp - # Filenames also share the prefix - assert req_files[0].name.startswith(rid_from_req) - assert resp_files[0].name.startswith(rid_from_req) - - -def test_capture_dir_is_created_if_missing(tmp_path: Path, spy: UpstreamSpy): - """make_app should create the capture dir on init.""" - target = tmp_path / "does" / "not" / "exist" - upstream_app = _build_upstream(spy) - upstream_transport = httpx.ASGITransport(app=upstream_app) - upstream_client = httpx.AsyncClient( - transport=upstream_transport, base_url="http://upstream" - ) - make_app("http://upstream", target, client=upstream_client) - assert target.is_dir() diff --git a/tests/test_proxy_http.py b/tests/test_proxy_http.py deleted file mode 100644 index 4bb23be..0000000 --- a/tests/test_proxy_http.py +++ /dev/null @@ -1,276 +0,0 @@ -"""Integration tests for the capture proxy — real HTTP over TCP loopback. - -Two uvicorn servers run in worker threads: - - mock upstream (Starlette app on a free port) - - capture proxy (Starlette app on another free port, pointed at upstream) - -The test client makes real ``httpx.Client`` HTTP calls to the proxy. -This catches wiring issues that the in-process ASGITransport unit -tests in ``test_proxy.py`` would not — header reconstruction, content -encoding, streaming-chunk pump-through, etc. - -Marked as ``integration`` so they can be filtered out with -``pytest -m 'not integration'`` when iterating on logic. -""" - -from __future__ import annotations - -import json -import socket -import threading -import time -from pathlib import Path - -import httpx -import pytest -import uvicorn -from starlette.applications import Starlette -from starlette.requests import Request -from starlette.responses import JSONResponse, Response, StreamingResponse -from starlette.routing import Route - -from agentcap.proxy import CHAT_COMPLETIONS_PATH, make_app - - -pytestmark = pytest.mark.integration - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _free_port() -> int: - s = socket.socket() - s.bind(("127.0.0.1", 0)) - port = s.getsockname()[1] - s.close() - return port - - -class UvicornThreadServer: - """Run a uvicorn server in a daemon thread and shut it down cleanly.""" - - def __init__(self, app, host: str = "127.0.0.1", port: int | None = None): - self.host = host - self.port = port or _free_port() - config = uvicorn.Config( - app, - host=host, - port=self.port, - log_level="error", - lifespan="on", - ) - self.server = uvicorn.Server(config) - # Disable signal handler installation — we're not the main thread. - self.server.install_signal_handlers = lambda *_: None - self._thread: threading.Thread | None = None - - @property - def url(self) -> str: - return f"http://{self.host}:{self.port}" - - def start(self, timeout: float = 5.0) -> None: - self._thread = threading.Thread(target=self.server.run, daemon=True) - self._thread.start() - deadline = time.monotonic() + timeout - # Poll until the server is accepting connections — server.started - # flips to True once uvicorn's serve() has bound the socket. - while time.monotonic() < deadline: - if self.server.started: - # Smoke-check the socket is actually accepting. - try: - with socket.create_connection((self.host, self.port), timeout=0.2): - return - except OSError: - pass - time.sleep(0.05) - raise RuntimeError(f"uvicorn server on :{self.port} failed to start in {timeout}s") - - def stop(self, timeout: float = 5.0) -> None: - self.server.should_exit = True - if self._thread is not None: - self._thread.join(timeout=timeout) - if self._thread.is_alive(): - # Best-effort: force-exit. uvicorn's force_exit triggers - # the loop to exit immediately on the next iteration. - self.server.force_exit = True - self._thread.join(timeout=timeout) - - -# --------------------------------------------------------------------------- -# Mock upstream -# --------------------------------------------------------------------------- - - -class UpstreamSpy: - def __init__(self) -> None: - self.received_bodies: list[dict] = [] - self.received_paths: list[str] = [] - self.responder = None - - def set_responder(self, fn) -> None: - self.responder = fn - - -def _build_upstream(spy: UpstreamSpy) -> Starlette: - async def chat_handler(request: Request) -> Response: - body = await request.body() - try: - spy.received_bodies.append(json.loads(body)) - except json.JSONDecodeError: - spy.received_bodies.append({"_raw": body.decode("utf-8", errors="replace")}) - spy.received_paths.append(request.url.path) - if spy.responder is None: - return JSONResponse({"error": "no responder"}, status_code=500) - return await spy.responder(request) - - async def models_handler(request: Request) -> Response: - spy.received_paths.append(request.url.path) - return JSONResponse( - {"object": "list", "data": [{"id": "real-mock", "object": "model"}]} - ) - - return Starlette( - routes=[ - Route(CHAT_COMPLETIONS_PATH, chat_handler, methods=["POST"]), - Route("/v1/models", models_handler, methods=["GET"]), - ] - ) - - -# --------------------------------------------------------------------------- -# Fixtures — two real uvicorn servers + a clean capture dir per test -# --------------------------------------------------------------------------- - - -@pytest.fixture -def spy() -> UpstreamSpy: - return UpstreamSpy() - - -@pytest.fixture -def capture_dir(tmp_path: Path) -> Path: - d = tmp_path / "capture" - d.mkdir() - return d - - -@pytest.fixture -def upstream(spy: UpstreamSpy): - server = UvicornThreadServer(_build_upstream(spy)) - server.start() - yield server - server.stop() - - -@pytest.fixture -def proxy(upstream: UvicornThreadServer, capture_dir: Path): - # Real proxy → real upstream URL. No client injection: the proxy - # creates its own httpx.AsyncClient and dials over TCP loopback. - proxy_app = make_app(upstream.url, capture_dir) - server = UvicornThreadServer(proxy_app) - server.start() - yield server - server.stop() - - -# --------------------------------------------------------------------------- -# Tests -# --------------------------------------------------------------------------- - - -def test_chat_nonstreaming_over_real_http( - spy: UpstreamSpy, capture_dir: Path, proxy: UvicornThreadServer -): - async def responder(request): - return JSONResponse( - { - "id": "chatcmpl-http-1", - "choices": [ - {"index": 0, "message": {"role": "assistant", "content": "ok"}} - ], - } - ) - - spy.set_responder(responder) - body = { - "model": "test-model", - "messages": [{"role": "user", "content": "ping"}], - "stream": False, - } - with httpx.Client(timeout=10.0) as client: - resp = client.post(f"{proxy.url}{CHAT_COMPLETIONS_PATH}", json=body) - assert resp.status_code == 200 - assert resp.json()["choices"][0]["message"]["content"] == "ok" - - # Upstream saw the body verbatim - assert spy.received_bodies == [body] - - # Trace dir got both files - req_files = list(capture_dir.glob("*.request.json")) - resp_files = list(capture_dir.glob("*.response.json")) - assert len(req_files) == 1 and len(resp_files) == 1 - assert json.loads(req_files[0].read_text())["body"] == body - assert ( - json.loads(resp_files[0].read_text())["body"]["choices"][0]["message"]["content"] - == "ok" - ) - - -def test_chat_streaming_over_real_http( - spy: UpstreamSpy, capture_dir: Path, proxy: UvicornThreadServer -): - sse_chunks = [ - b'data: {"choices":[{"delta":{"role":"assistant"}}]}\n\n', - b'data: {"choices":[{"delta":{"content":"hello"}}]}\n\n', - b'data: {"choices":[{"delta":{"content":" world"}}]}\n\n', - b"data: [DONE]\n\n", - ] - - async def responder(request): - async def gen(): - for c in sse_chunks: - yield c - - return StreamingResponse(gen(), media_type="text/event-stream") - - spy.set_responder(responder) - body = { - "model": "test-model", - "messages": [{"role": "user", "content": "stream me"}], - "stream": True, - } - received = bytearray() - with httpx.Client(timeout=10.0) as client: - with client.stream( - "POST", f"{proxy.url}{CHAT_COMPLETIONS_PATH}", json=body - ) as resp: - assert resp.status_code == 200 - for chunk in resp.iter_bytes(): - received.extend(chunk) - - expected = b"".join(sse_chunks) - assert bytes(received) == expected - - resp_files = list(capture_dir.glob("*.response.json")) - assert len(resp_files) == 1 - record = json.loads(resp_files[0].read_text()) - assert record["stream"] is True - assert record["status_code"] == 200 - assert record["raw"] == expected.decode("utf-8") - - -def test_passthrough_over_real_http_does_not_capture( - spy: UpstreamSpy, capture_dir: Path, proxy: UvicornThreadServer -): - with httpx.Client(timeout=10.0) as client: - resp = client.get(f"{proxy.url}/v1/models") - assert resp.status_code == 200 - assert resp.json() == { - "object": "list", - "data": [{"id": "real-mock", "object": "model"}], - } - # Upstream got the call, capture dir untouched - assert "/v1/models" in spy.received_paths - assert list(capture_dir.iterdir()) == [] diff --git a/tests/test_proxy_meta.py b/tests/test_proxy_meta.py deleted file mode 100644 index cede408..0000000 --- a/tests/test_proxy_meta.py +++ /dev/null @@ -1,250 +0,0 @@ -"""Tests for the proxy's per-request stamping: ``upstream_url`` on -captured requests and ``upstream_fingerprint`` on captured responses. - -The proxy keeps no metadata file, no startup probe, no drift state — -those concerns moved to the export layer, derived from the per-row -stamps tested here. - -Uses the same in-process ASGI wiring as ``test_proxy.py`` — proxy -talks to a Starlette mock upstream through an ``httpx.AsyncClient`` -backed by ``ASGITransport``. -""" - -from __future__ import annotations - -import json -from pathlib import Path - -import httpx -import pytest -from starlette.applications import Starlette -from starlette.requests import Request -from starlette.responses import JSONResponse, Response, StreamingResponse -from starlette.routing import Route -from starlette.testclient import TestClient - -from agentcap.proxy import CHAT_COMPLETIONS_PATH, make_app - - -class UpstreamSpy: - def __init__(self) -> None: - self.responder = None - - def set_responder(self, fn) -> None: - self.responder = fn - - -def _build_upstream(spy: UpstreamSpy) -> Starlette: - async def chat_handler(request: Request) -> Response: - if spy.responder is None: - return JSONResponse({"error": "no responder"}, status_code=500) - return await spy.responder(request) - - return Starlette( - routes=[Route(CHAT_COMPLETIONS_PATH, chat_handler, methods=["POST"])] - ) - - -@pytest.fixture -def capture_dir(tmp_path: Path) -> Path: - d = tmp_path / "capture" - d.mkdir() - return d - - -@pytest.fixture -def spy() -> UpstreamSpy: - return UpstreamSpy() - - -@pytest.fixture -def proxy_app(spy: UpstreamSpy, capture_dir: Path): - upstream_transport = httpx.ASGITransport(app=_build_upstream(spy)) - upstream_client = httpx.AsyncClient( - transport=upstream_transport, base_url="http://upstream" - ) - return make_app("http://upstream", capture_dir, client=upstream_client) - - -# --------------------------------------------------------------------------- -# Per-request: upstream_url stamping -# --------------------------------------------------------------------------- - - -def test_request_stamps_upstream_url( - spy: UpstreamSpy, capture_dir: Path, proxy_app -): - """Every ``.request.json`` carries the URL the proxy was forwarding - to. Export derives the provider classification from this stamp - alone — no sidecar metadata file involved.""" - async def responder(request): - return JSONResponse({"id": "x", "model": "m", "choices": []}) - - spy.set_responder(responder) - with TestClient(proxy_app) as client: - client.post( - CHAT_COMPLETIONS_PATH, - json={"model": "m", "messages": [{"role": "user", "content": "."}]}, - ) - - req_files = list(capture_dir.glob("*.request.json")) - assert len(req_files) == 1 - rec = json.loads(req_files[0].read_text()) - assert rec["upstream_url"] == "http://upstream" - - -def test_no_metadata_file_written( - spy: UpstreamSpy, capture_dir: Path, proxy_app -): - """The capture dir contains only per-request capture files — no - ``_proxy.json``, no ``_meta.json``, nothing else.""" - async def responder(request): - return JSONResponse({"id": "x", "model": "m", "choices": []}) - - spy.set_responder(responder) - with TestClient(proxy_app) as client: - client.post( - CHAT_COMPLETIONS_PATH, - json={"model": "m", "messages": [{"role": "user", "content": "."}]}, - ) - - names = sorted(p.name for p in capture_dir.iterdir()) - # One .request.json + one .response.json, nothing else. - assert len(names) == 2 - assert all( - n.endswith(".request.json") or n.endswith(".response.json") - for n in names - ) - - -# --------------------------------------------------------------------------- -# Per-response: upstream_fingerprint stamping -# --------------------------------------------------------------------------- - - -def test_response_fingerprint_extracted_from_upstream_headers( - spy: UpstreamSpy, capture_dir: Path, proxy_app -): - async def responder(request): - return JSONResponse( - { - "id": "x", - "model": "qwen-actually-served", - "choices": [{"index": 0, "message": {"role": "assistant", "content": "hi"}}], - }, - headers={"server": "llama.cpp", "x-served-by": "fireworks-pod-7"}, - ) - - spy.set_responder(responder) - with TestClient(proxy_app) as client: - resp = client.post( - CHAT_COMPLETIONS_PATH, - json={"model": "m", "messages": [{"role": "user", "content": "."}]}, - ) - assert resp.status_code == 200 - - resp_files = list(capture_dir.glob("*.response.json")) - assert len(resp_files) == 1 - rec = json.loads(resp_files[0].read_text()) - fp = rec["upstream_fingerprint"] - assert fp["server"] == "llama.cpp" - assert fp["x_served_by"] == "fireworks-pod-7" - assert fp["served_model"] == "qwen-actually-served" - assert fp["build_info"] is None # not echoed on this response - - -def test_streaming_response_fingerprint_picks_model_from_first_chunk( - spy: UpstreamSpy, capture_dir: Path, proxy_app -): - """For SSE responses the body isn't a single dict; extract ``model`` - from the first parseable ``data:`` payload.""" - sse_chunks = [ - b'data: {"id":"x","model":"qwen-served","choices":[{"delta":{"role":"assistant"}}]}\n\n', - b'data: {"id":"x","model":"qwen-served","choices":[{"delta":{"content":"hi"}}]}\n\n', - b"data: [DONE]\n\n", - ] - - async def responder(request): - async def gen(): - for c in sse_chunks: - yield c - return StreamingResponse( - gen(), - media_type="text/event-stream", - headers={"server": "llama.cpp"}, - ) - - spy.set_responder(responder) - with TestClient(proxy_app) as client: - with client.stream( - "POST", - CHAT_COMPLETIONS_PATH, - json={"model": "m", "stream": True, "messages": [{"role": "user", "content": "."}]}, - ) as resp: - for _ in resp.iter_bytes(): - pass - - rec = json.loads(next(capture_dir.glob("*.response.json")).read_text()) - assert rec["stream"] is True - assert rec["upstream_fingerprint"]["served_model"] == "qwen-served" - assert rec["upstream_fingerprint"]["server"] == "llama.cpp" - - -# --------------------------------------------------------------------------- -# Export-side provider derivation from the per-request stamp -# --------------------------------------------------------------------------- - - -def test_detect_provider_columns_derives_from_request_stamp(tmp_path: Path): - from agentcap.export import detect_provider_columns - - capture = tmp_path / "t" - capture.mkdir() - (capture / "rid.request.json").write_text(json.dumps({ - "request_id": "rid", - "captured_at": 1, - "upstream_url": "https://router.huggingface.co", - "body": {"model": "meta-llama/Llama-3.3-70B:fireworks-ai", "messages": []}, - })) - cols = detect_provider_columns(capture) - assert cols["upstream_url"] == "https://router.huggingface.co" - assert cols["provider"] == "hf-router/fireworks-ai" - - -def test_detect_provider_columns_local_upstream(tmp_path: Path): - from agentcap.export import detect_provider_columns - - capture = tmp_path / "t" - capture.mkdir() - (capture / "rid.request.json").write_text(json.dumps({ - "request_id": "rid", - "captured_at": 1, - "upstream_url": "http://127.0.0.1:8000", - "body": {"model": "qwen-test", "messages": []}, - })) - cols = detect_provider_columns(capture) - assert cols["provider"] == "local" - - -def test_detect_provider_columns_empty_for_legacy_capture(tmp_path: Path): - """Trace dir from before the proxy started stamping upstream_url — - no way to derive the column, return empty so the parquet schema - just omits it.""" - from agentcap.export import detect_provider_columns - - capture = tmp_path / "t" - capture.mkdir() - (capture / "rid.request.json").write_text(json.dumps({ - "request_id": "rid", - "captured_at": 1, - "body": {"model": "m", "messages": []}, - })) - assert detect_provider_columns(capture) == {} - - -def test_detect_provider_columns_no_requests(tmp_path: Path): - from agentcap.export import detect_provider_columns - - capture = tmp_path / "t" - capture.mkdir() - assert detect_provider_columns(capture) == {} diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py deleted file mode 100644 index a0f70ae..0000000 --- a/tests/test_sandbox.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Tests for the sandbox abstraction.""" - -from __future__ import annotations - -import pytest - -from agentcap.sandbox import get_sandbox -from agentcap.sandbox.podman import PodmanSandbox - - -def test_get_sandbox_returns_podman_sandbox(): - """The factory hands back a ``PodmanSandbox`` keyed on the - canonical per-agent image ref.""" - sb = get_sandbox(agent="goose") - assert isinstance(sb, PodmanSandbox) - assert sb.image == "localhost/agentcap-goose:latest" - - -def test_get_sandbox_requires_agent(): - with pytest.raises(TypeError): - get_sandbox() # type: ignore[call-arg] diff --git a/tests/test_scan.py b/tests/test_scan.py deleted file mode 100644 index da896e3..0000000 --- a/tests/test_scan.py +++ /dev/null @@ -1,251 +0,0 @@ -"""Unit tests for ``agentcap.scan``. - -These exercise the real trufflehog binary (skipped when not -installed). Cache behaviour is checked by calling ``scan_run_dir`` -twice and inspecting the returned ``was_cached`` flag + the -persisted ``scan.json`` — no monkeypatching of ``scan_path`` itself, -so the tests fail if the cache short-circuit regresses. - -Missing-binary errors are exercised by manipulating ``PATH`` / -``HOME`` (legitimate inputs to ``find_trufflehog``), not by -mocking ``shutil.which``. -""" - -from __future__ import annotations - -import json -import os -import shutil -from pathlib import Path - -import pytest - -from agentcap.scan import ( - SCAN_CACHE_NAME, - TrufflehogMissingError, - find_trufflehog, - load_cached_scan, - scan_path, - scan_run_dir, -) - - -def _has_trufflehog() -> bool: - if shutil.which("trufflehog"): - return True - local = Path.home() / ".local" / "bin" / "trufflehog" - return local.is_file() and os.access(local, os.X_OK) - - -_HAS_TRUFFLEHOG = _has_trufflehog() - - -live = pytest.mark.skipif( - not _HAS_TRUFFLEHOG, reason="trufflehog binary not installed" -) - - -def _make_run_dir(root: Path, *, with_poisoned_capture: bool = False) -> Path: - """Minimal run layout with only ``captures/`` populated, so each - test controls exactly which subdir trufflehog has to scan.""" - run_dir = root / "agent-local-20260601-000000" - captures = run_dir / "captures" - captures.mkdir(parents=True) - body = '{"request_id":"rid","captured_at":1,"body":{"model":"m","messages":[]}}' - (captures / "rid.request.json").write_text(body) - if with_poisoned_capture: - # Stripe doc-example key — pattern-matches the Stripe detector, - # never verifies against the live API. - (captures / "poisoned.request.json").write_text( - '{"messages":[{"role":"user","content":"sk_live_4eC39HqLyjWDarjtT1zdp7dc"}]}' - ) - return run_dir - - -# --------------------------------------------------------------------------- -# find_trufflehog — PATH / HOME-driven, no mocking -# --------------------------------------------------------------------------- - - -def test_find_trufflehog_raises_when_path_and_home_are_empty( - monkeypatch, tmp_path, -): - """Both PATH lookup and ~/.local/bin fallback miss → install hint.""" - monkeypatch.setenv("PATH", str(tmp_path)) # nothing in this dir - monkeypatch.setenv("HOME", str(tmp_path)) # no .local/bin/trufflehog under HOME - with pytest.raises(TrufflehogMissingError) as exc: - find_trufflehog() - assert "install.sh" in str(exc.value) - - -def test_find_trufflehog_falls_back_to_local_bin(monkeypatch, tmp_path): - """No PATH hit → ~/.local/bin/trufflehog wins.""" - fake = tmp_path / ".local" / "bin" / "trufflehog" - fake.parent.mkdir(parents=True) - fake.write_text("#!/bin/sh\n") - fake.chmod(0o755) - monkeypatch.setenv("PATH", str(tmp_path)) # empty - monkeypatch.setenv("HOME", str(tmp_path)) - assert find_trufflehog() == str(fake) - - -@live -def test_find_trufflehog_finds_installed_binary(): - """The real installed binary is locatable.""" - bin_path = find_trufflehog() - assert os.path.basename(bin_path) == "trufflehog" - assert os.access(bin_path, os.X_OK) - - -# --------------------------------------------------------------------------- -# scan_path — runs the real binary -# --------------------------------------------------------------------------- - - -@live -def test_scan_path_clean_dir_no_hits(tmp_path): - (tmp_path / "f.json").write_text('{"model": "m", "messages": []}\n') - result = scan_path(tmp_path, no_verification=True) - assert result.verified == [] - assert result.unverified == [] - assert result.chunks_scanned >= 1 - - -@live -def test_scan_path_detects_unverified_stripe_pattern(tmp_path): - """A canned Stripe-shaped string trips the Stripe detector. - With ``no_verification=True`` Stripe's docs example lands as - unverified — we don't call the live API.""" - (tmp_path / "poisoned.json").write_text( - '{"messages":[{"role":"user","content":"sk_live_4eC39HqLyjWDarjtT1zdp7dc"}]}' - ) - result = scan_path(tmp_path, no_verification=True) - assert result.verified == [] - assert len(result.unverified) >= 1 - assert result.unverified[0].detector.lower() == "stripe" - assert result.unverified[0].file.endswith("poisoned.json") - - -# --------------------------------------------------------------------------- -# scan_run_dir — cache write + reuse + mode mismatch (real binary) -# --------------------------------------------------------------------------- - - -@live -def test_scan_run_dir_writes_cache_on_first_call(tmp_path): - run_dir = _make_run_dir(tmp_path) - result, was_cached = scan_run_dir(run_dir, no_verification=True) - assert was_cached is False - cache_path = run_dir / SCAN_CACHE_NAME - assert cache_path.is_file(), "scan.json should be written on first call" - cache = json.loads(cache_path.read_text()) - assert cache["no_verification"] is True - assert cache["chunks_scanned"] == result.chunks_scanned - assert cache["bytes_scanned"] == result.bytes_scanned - - -@live -def test_scan_run_dir_reuses_cache_on_second_call(tmp_path): - """Second call in the same mode short-circuits — no trufflehog - invocation. Verified by stat-ing the cache mtime: a fresh scan - would rewrite it.""" - run_dir = _make_run_dir(tmp_path) - scan_run_dir(run_dir, no_verification=True) - mtime_after_first = (run_dir / SCAN_CACHE_NAME).stat().st_mtime - - _, was_cached = scan_run_dir(run_dir, no_verification=True) - assert was_cached is True - assert (run_dir / SCAN_CACHE_NAME).stat().st_mtime == mtime_after_first - - -@live -def test_scan_run_dir_rescan_forces_fresh_subprocess(tmp_path): - run_dir = _make_run_dir(tmp_path) - scan_run_dir(run_dir, no_verification=True) - - # mtime resolution on linux can be coarse — overwrite the cache - # with a sentinel so a successful re-scan is observable by the - # cache content changing back to a real ScanResult. - (run_dir / SCAN_CACHE_NAME).write_text("{}") - - _, was_cached = scan_run_dir(run_dir, no_verification=True, rescan=True) - assert was_cached is False - # Cache rewritten with a real ScanResult, not the sentinel ``{}``. - refreshed = json.loads((run_dir / SCAN_CACHE_NAME).read_text()) - assert "chunks_scanned" in refreshed - assert refreshed["chunks_scanned"] >= 1 - - -@live -def test_scan_run_dir_rescans_when_verify_request_meets_pattern_only_cache( - tmp_path, -): - """Pattern-only cache can't satisfy a verified request — re-scan.""" - run_dir = _make_run_dir(tmp_path) - scan_run_dir(run_dir, no_verification=True) # pattern-only cache - _, was_cached = scan_run_dir(run_dir, no_verification=False) - assert was_cached is False - # Cache file now records the new (verified) mode. - cache = json.loads((run_dir / SCAN_CACHE_NAME).read_text()) - assert cache["no_verification"] is False - - -@live -def test_scan_run_dir_verified_cache_satisfies_pattern_request(tmp_path): - """Verified cache covers a pattern-only request (patterns are a - subset of what verification ran on).""" - run_dir = _make_run_dir(tmp_path) - scan_run_dir(run_dir, no_verification=False) # verified cache - _, was_cached = scan_run_dir(run_dir, no_verification=True) - assert was_cached is True - - -@live -def test_scan_run_dir_finds_hits_in_captures(tmp_path): - run_dir = _make_run_dir(tmp_path, with_poisoned_capture=True) - result, _ = scan_run_dir(run_dir, no_verification=True) - assert result.verified == [] - assert any(h.detector.lower() == "stripe" for h in result.unverified) - - -@live -def test_scan_run_dir_excludes_self_cache(tmp_path): - """scan.json itself must never be in the scan corpus — that would - inflate chunk counts on every re-scan.""" - run_dir = _make_run_dir(tmp_path) - first, _ = scan_run_dir(run_dir, no_verification=True) - second, _ = scan_run_dir(run_dir, no_verification=True, rescan=True) - assert first.chunks_scanned == second.chunks_scanned - assert first.bytes_scanned == second.bytes_scanned - - -# --------------------------------------------------------------------------- -# load_cached_scan — pure file IO, no binary needed -# --------------------------------------------------------------------------- - - -def test_load_cached_scan_returns_none_when_missing(tmp_path): - assert load_cached_scan(tmp_path, no_verification=False) is None - - -def test_load_cached_scan_returns_none_on_invalid_json(tmp_path): - (tmp_path / SCAN_CACHE_NAME).write_text("not json at all") - assert load_cached_scan(tmp_path, no_verification=False) is None - - -def test_load_cached_scan_rejects_pattern_only_when_verify_requested( - tmp_path, -): - (tmp_path / SCAN_CACHE_NAME).write_text(json.dumps({ - "scanned_at": 0, - "no_verification": True, - "bytes_scanned": 0, - "chunks_scanned": 0, - "verified": [], - "unverified": [], - })) - # Pattern-only cache; caller wants verified → cache is not enough. - assert load_cached_scan(tmp_path, no_verification=False) is None - # Same cache satisfies a pattern-only request. - result = load_cached_scan(tmp_path, no_verification=True) - assert result is not None From e8e28ae983a73bc4624d30300c048339221be029 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 25 Jun 2026 15:31:12 +0000 Subject: [PATCH 2/2] chore: drop Python references from doc comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the Python package gone, remove the "Ports `*.py`" provenance and the "matches the Python" notes from the Rust module docs — comments should describe the code as it stands. Kept the references to the external pyarrow / `datasets` ecosystem (consumer-side parquet compatibility), which remain accurate. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/captures.rs | 3 +-- src/diff.rs | 6 ++---- src/drivers/goose.rs | 4 ++-- src/drivers/hermes.rs | 4 ++-- src/drivers/mod.rs | 2 +- src/drivers/opencode.rs | 2 +- src/drivers/pi.rs | 2 +- src/export.rs | 2 +- src/followups/mod.rs | 2 +- src/followups/synthesized.rs | 2 +- src/hub/footer.rs | 4 ++-- src/inspect/mod.rs | 4 ++-- src/inspect/render.rs | 9 ++++----- src/inspect/sources.rs | 6 ++---- src/ls.rs | 2 +- src/model.rs | 6 +++--- src/orchestrator.rs | 4 ++-- src/parquet_io.rs | 3 +-- src/proxy/capture.rs | 3 +-- src/proxy/mod.rs | 2 +- src/query.rs | 2 +- src/run.rs | 2 +- src/sandbox/mod.rs | 4 ++-- src/sandbox/provisioning.rs | 6 +++--- src/scan.rs | 4 ++-- src/sse.rs | 4 ++-- tests/cross_impl.rs | 2 +- tests/live.rs | 2 +- 28 files changed, 45 insertions(+), 53 deletions(-) diff --git a/src/captures.rs b/src/captures.rs index 037fe60..53191ac 100644 --- a/src/captures.rs +++ b/src/captures.rs @@ -1,8 +1,7 @@ //! Resolve a captured request by id and hand back the body. //! //! No normalization of the JSON — captures persist the request as parsed JSON, -//! so the original byte sequence isn't recoverable, but the object is. Ports -//! `captures.py` (`load_request(s)`, `resolve_workspace_rid`). +//! so the original byte sequence isn't recoverable, but the object is. use std::collections::{HashMap, HashSet}; use std::path::{Path, PathBuf}; diff --git a/src/diff.rs b/src/diff.rs index 85cd4b8..aa0269b 100644 --- a/src/diff.rs +++ b/src/diff.rs @@ -1,6 +1,4 @@ -//! Message diffing + one-line summaries for the inspect picker. Ports -//! `_message_key` / `_diff_messages` / `_delta_label` / `_message_text` / -//! `_message_summary` / `_flatten` from the Python. +//! Message diffing + one-line summaries for the inspect picker. use crate::model::canonical_json; use serde_json::Value; @@ -37,7 +35,7 @@ pub fn message_key(m: &Value) -> MessageKey { } } -/// Python truthiness for the values we test: non-empty arrays/strings/objects, +/// Truthiness for the values we test: non-empty arrays/strings/objects, /// non-zero numbers, `true`. `null`/`false`/empty are falsy. fn is_truthy(v: &Value) -> bool { match v { diff --git a/src/drivers/goose.rs b/src/drivers/goose.rs index 44d50f5..8f4fd5b 100644 --- a/src/drivers/goose.rs +++ b/src/drivers/goose.rs @@ -1,5 +1,5 @@ -//! Goose driver: `goose run -t ""`. Ports `drivers/goose.py`. The proxy -//! URL + provider are baked into the image ENV; the driver sets `GOOSE_MODEL`. +//! Goose driver: `goose run -t ""`. The proxy URL + provider are baked +//! into the image ENV; the driver sets `GOOSE_MODEL`. use std::collections::BTreeMap; use std::sync::Arc; diff --git a/src/drivers/hermes.rs b/src/drivers/hermes.rs index 8c0b7ae..69d3ce7 100644 --- a/src/drivers/hermes.rs +++ b/src/drivers/hermes.rs @@ -1,5 +1,5 @@ -//! Hermes driver: `hermes chat -q ""` non-interactively. Ports -//! `drivers/hermes.py`. The proxy URL + config are baked into the image. +//! Hermes driver: `hermes chat -q ""` non-interactively. The proxy URL +//! + config are baked into the image. use std::collections::BTreeMap; use std::sync::{Arc, OnceLock}; diff --git a/src/drivers/mod.rs b/src/drivers/mod.rs index a94bdcf..50f6d40 100644 --- a/src/drivers/mod.rs +++ b/src/drivers/mod.rs @@ -1,4 +1,4 @@ -//! Agent driver adapters. Ports `drivers/__init__.py` + the four agent modules. +//! Agent driver adapters. //! //! A driver wraps an agent CLI so the orchestrator can `start` a session, //! `resume` it, and read back the final response text. Drivers shell out via the diff --git a/src/drivers/opencode.rs b/src/drivers/opencode.rs index 3812b61..e211d93 100644 --- a/src/drivers/opencode.rs +++ b/src/drivers/opencode.rs @@ -1,4 +1,4 @@ -//! OpenCode driver: `opencode run --format json`. Ports `drivers/opencode.py`. +//! OpenCode driver: `opencode run --format json`. //! OpenCode emits NDJSON events on stdout: `text` events carry assistant chunks, //! every event has `sessionID`, `tool_use` events carry error states. diff --git a/src/drivers/pi.rs b/src/drivers/pi.rs index 1a90c46..a160569 100644 --- a/src/drivers/pi.rs +++ b/src/drivers/pi.rs @@ -1,4 +1,4 @@ -//! pi-mono driver: `pi -p "" --provider local`. Ports `drivers/pi.py`. +//! pi-mono driver: `pi -p "" --provider local`. //! pi mints its own session UUID on start and resumes the latest via `--continue`. use std::collections::BTreeMap; diff --git a/src/export.rs b/src/export.rs index b117f65..a31f509 100644 --- a/src/export.rs +++ b/src/export.rs @@ -1,6 +1,6 @@ //! `export`: render capture dirs to parquet and push to the Hub. //! -//! Ports `export.py` + the `export_cmd` orchestration in `__main__.py`. Three +//! Three //! artifacts per push: `/-captures` (parquet), one //! `/--traces` per agent (raw native traces), and a //! Collection titled `` grouping them. The trufflehog gate (verified hits diff --git a/src/followups/mod.rs b/src/followups/mod.rs index deef85d..9576eb9 100644 --- a/src/followups/mod.rs +++ b/src/followups/mod.rs @@ -1,4 +1,4 @@ -//! Follow-up strategies for multi-turn runs. Ports `followups/*`. +//! Follow-up strategies for multi-turn runs. //! //! `turn` is the 1-indexed number of the upcoming turn (first follow-up is //! `turn=2`). `continue` is cheapest; `templates` rotates a small pool; diff --git a/src/followups/synthesized.rs b/src/followups/synthesized.rs index 67e1eaf..3684533 100644 --- a/src/followups/synthesized.rs +++ b/src/followups/synthesized.rs @@ -1,5 +1,5 @@ //! Synthesized follow-up: sends `(original_task, last_response)` to a model and -//! uses the reply as the next user message. Ports `followups/synthesized.py`. +//! uses the reply as the next user message. //! //! By design this call **bypasses the capture proxy** — it talks to the model //! server directly so the captured corpus stays a clean record of agent↔model diff --git a/src/hub/footer.rs b/src/hub/footer.rs index 762f5d3..1cc436e 100644 --- a/src/hub/footer.rs +++ b/src/hub/footer.rs @@ -3,8 +3,8 @@ //! //! Our writer stamps `agent`/`model`/`tasks` into the parquet key-value //! metadata, so a footer read surfaces the full preview slice. (Parquets written -//! by the Python pyarrow path keep those under the embedded `ARROW:schema` blob -//! instead; those show only the row count here until selected.) +//! by pyarrow keep those under the embedded `ARROW:schema` blob instead; those +//! show only the row count here until selected.) use anyhow::{bail, Result}; use parquet::errors::ParquetError; diff --git a/src/inspect/mod.rs b/src/inspect/mod.rs index 21b22ef..8d0730d 100644 --- a/src/inspect/mod.rs +++ b/src/inspect/mod.rs @@ -1,5 +1,5 @@ //! `inspect`: classify the TARGET and launch the picker (or dump a request body -//! for a bare hex rid). Ports `_classify_target` + `inspect_cmd`. +//! for a bare hex rid). mod app; mod render; @@ -111,7 +111,7 @@ fn dump_rid(rid: &str, rid_flag: bool) -> Result<()> { Ok(()) } -/// Classify the TARGET positional. Ports `_classify_target`. +/// Classify the TARGET positional. fn classify_target(target: Option<&str>) -> Result { let Some(target) = target else { return Ok(Target::Workspace(std::env::current_dir()?.join(WORKSPACE_DIR))); diff --git a/src/inspect/render.rs b/src/inspect/render.rs index 9aa3941..a869df2 100644 --- a/src/inspect/render.rs +++ b/src/inspect/render.rs @@ -1,7 +1,6 @@ //! Preview-pane rendering for inspect: build `ratatui::Text` for a run, a //! request (header + prompt + message diff), a flattened message, and an HF -//! parquet entry. Query terms are highlighted (bold red), replacing the Python -//! `_highlight` ANSI pipeline. +//! parquet entry. Query terms are highlighted (bold red). use ratatui::style::{Color, Modifier, Style}; use ratatui::text::{Line, Span, Text}; @@ -13,7 +12,7 @@ use super::sources::{MsgRecord, ReqRow, RunRow}; const ARGS_CAP: usize = 240; -/// Run metadata preview (run picker). Ports `_run_preview_cmd`. +/// Run metadata preview (run picker). pub fn run_preview(run: &RunRow) -> Text<'static> { let meta: Value = std::fs::read(run.run_dir.join("run.json")) .ok() @@ -75,7 +74,7 @@ pub fn hf_parquet_preview( Text::from(lines) } -/// Request preview: header + initial prompt + message diff. Ports `_preview_cmd`. +/// Request preview: header + initial prompt + message diff. pub fn request_preview(row: &ReqRow, body: &Value, prev_body: Option<&Value>, terms: &[String]) -> Text<'static> { let messages = body .get("messages") @@ -157,7 +156,7 @@ pub fn request_preview(row: &ReqRow, body: &Value, prev_body: Option<&Value>, te Text::from(lines) } -/// One flattened message detail (message picker). Ports `_render_msg_preview`. +/// One flattened message detail (message picker). pub fn message_preview(rec: &MsgRecord, terms: &[String]) -> Text<'static> { let mut lines = vec![plain(format!("role: {}", rec.role))]; match rec.msg_idx { diff --git a/src/inspect/sources.rs b/src/inspect/sources.rs index 6782005..0fc1d16 100644 --- a/src/inspect/sources.rs +++ b/src/inspect/sources.rs @@ -1,8 +1,6 @@ //! Data layer for inspect: enumerate runs / requests / messages from a //! workspace or a parquet, and load request/response bodies for the preview and -//! message levels. Ports `_enumerate_workspace_requests`, -//! `_enumerate_parquet_requests`, `_request_messages_for_view`, and the -//! body/response loaders from `__main__.py`. +//! message levels. use std::collections::HashMap; use std::path::{Path, PathBuf}; @@ -311,7 +309,7 @@ fn value_to_string(v: &Value) -> String { } /// Flatten request `messages[]` + the decoded response into one record per -/// picker row. Ports `_request_messages_for_view`. +/// picker row. pub fn request_messages_for_view(body: &Value, resp: Option<&Value>) -> Vec { let mut records = Vec::new(); let msgs = body diff --git a/src/ls.rs b/src/ls.rs index 452b0da..2f53b40 100644 --- a/src/ls.rs +++ b/src/ls.rs @@ -1,4 +1,4 @@ -//! `ls`: list runs under a local workspace. Ports `ls_cmd`. +//! `ls`: list runs under a local workspace. //! //! Unlike `export`, `ls` does NOT consult `$AGENTCAP_WORKSPACE` — what you point //! it at is what you get. Accepts either the parent dir or the `.agentcap/` dir. diff --git a/src/model.rs b/src/model.rs index a97c6ae..e8548d5 100644 --- a/src/model.rs +++ b/src/model.rs @@ -16,9 +16,9 @@ pub struct DecodedResponse { pub finish_reason: Option, } -/// Canonical JSON string with object keys sorted recursively — the Rust analog -/// of Python's `json.dumps(obj, sort_keys=True)`. Used to make heterogeneous -/// sub-objects (message content arrays, tool_calls) comparable for diffing. +/// Canonical JSON string with object keys sorted recursively. Used to make +/// heterogeneous sub-objects (message content arrays, tool_calls) comparable +/// for diffing. pub fn canonical_json(v: &Value) -> String { let mut out = String::new(); write_canonical(v, &mut out); diff --git a/src/orchestrator.rs b/src/orchestrator.rs index 09d6804..aafeb63 100644 --- a/src/orchestrator.rs +++ b/src/orchestrator.rs @@ -1,5 +1,5 @@ -//! Drive an agent driver through a corpus with a follow-up strategy. Ports -//! `orchestrator.py`. Proxy-agnostic: the caller wires capture context via the +//! Drive an agent driver through a corpus with a follow-up strategy. +//! Proxy-agnostic: the caller wires capture context via the //! `set_ctx` callback (the proxy stamps it onto each capture). use std::path::Path; diff --git a/src/parquet_io.rs b/src/parquet_io.rs index 87efec8..1fde237 100644 --- a/src/parquet_io.rs +++ b/src/parquet_io.rs @@ -1,7 +1,6 @@ //! Capture dir → parquet (export) and parquet → request bodies (read). //! -//! Ports `export.py`'s `export_local` / `_iter_pairs` / `_row` and the parquet -//! readers in `captures.py`. The `request` / `response` columns are +//! The `request` / `response` columns are //! JSON-stringified bodies (Arrow can't infer a schema over heterogeneous //! tool-schema fields); `agent` / `model` / `tasks` are stamped into the //! parquet key-value metadata so the inspect picker can label files cheaply. diff --git a/src/proxy/capture.rs b/src/proxy/capture.rs index 2567fd4..1ba50ee 100644 --- a/src/proxy/capture.rs +++ b/src/proxy/capture.rs @@ -1,7 +1,6 @@ //! Capture record shapes + persistence. Writes `.request.json` / //! `.response.json` in the exact shape the data/UI half reads (see -//! `parquet_io` / `captures`). Ports the `_persist_*` / fingerprint / SSE-model -//! helpers from `proxy.py`. +//! `parquet_io` / `captures`). use std::io; use std::path::Path; diff --git a/src/proxy/mod.rs b/src/proxy/mod.rs index 811508e..190d7a4 100644 --- a/src/proxy/mod.rs +++ b/src/proxy/mod.rs @@ -1,4 +1,4 @@ -//! Synchronous capture proxy. Ports `proxy.py`. +//! Synchronous capture proxy. //! //! A `tiny_http` server on a worker-thread pool fronts an OpenAI-compatible //! upstream. `POST /v1/chat/completions` is captured to diff --git a/src/query.rs b/src/query.rs index 1d222c9..a13e430 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1,5 +1,5 @@ //! fzf-style query parsing → the literal substrings to highlight in the preview -//! pane. Ports `_parse_fzf_terms`. nucleo handles the actual matching with the +//! pane. nucleo handles the actual matching with the //! same operator atoms; this only extracts what to colour. /// Split a query into the literal text of each non-negated term, with the diff --git a/src/run.rs b/src/run.rs index 03bc048..c4febfb 100644 --- a/src/run.rs +++ b/src/run.rs @@ -1,5 +1,5 @@ //! The `run` command: drive an agent CLI through a corpus, capture every -//! chat-completion, and summarise. Ports `run_cmd` from `__main__.py`. +//! chat-completion, and summarise. use std::collections::BTreeMap; use std::path::{Path, PathBuf}; diff --git a/src/sandbox/mod.rs b/src/sandbox/mod.rs index 49a33b1..cdfae56 100644 --- a/src/sandbox/mod.rs +++ b/src/sandbox/mod.rs @@ -1,4 +1,4 @@ -//! Podman container sandbox. Ports `sandbox/podman.py` + `sandbox/__init__.py`. +//! Podman container sandbox. //! //! Each [`PodmanSandbox::run`] is a fresh `podman run --rm` against a pre-built //! per-agent image. Host paths in `writable_paths` / `readonly_paths` are @@ -178,7 +178,7 @@ fn run_child(wrapped: &[String], timeout: Option) -> std::result::Resu } /// Provision (build the image if needed) and return a sandbox, or an error with -/// an install hint. Ports `require_sandbox_or_die`. +/// an install hint. pub fn require_sandbox( agent: &str, env: BTreeMap, diff --git a/src/sandbox/provisioning.rs b/src/sandbox/provisioning.rs index f01d766..121d15a 100644 --- a/src/sandbox/provisioning.rs +++ b/src/sandbox/provisioning.rs @@ -1,9 +1,9 @@ -//! Per-agent podman image lifecycle. Ports `sandbox/podman_provisioning.py`. +//! Per-agent podman image lifecycle. //! //! The Containerfile is the source of truth: its SHA256 (plus any sibling //! context dir) is baked into the built image as a label; a mismatch on a later -//! run forces a rebuild. The hash algorithm matches the Python byte-for-byte so -//! Rust and Python agree and don't trigger needless rebuilds. +//! run forces a rebuild. The hash format is fixed so images built by earlier +//! agentcap versions are reused on upgrade, not needlessly rebuilt. use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; diff --git a/src/scan.rs b/src/scan.rs index 6fefd9b..cedb761 100644 --- a/src/scan.rs +++ b/src/scan.rs @@ -1,7 +1,7 @@ //! Secret scan over a capture run, gating `export`. //! -//! Shells out to `trufflehog filesystem` and parses its JSON. Policy (matching -//! the Python `scan.py`): a single **verified** hit aborts the export; +//! Shells out to `trufflehog filesystem` and parses its JSON. Policy: a single +//! **verified** hit aborts the export; //! **unverified** hits are reported but non-blocking (pattern matchers have a //! real false-positive rate). Results are cached to `/scan.json`; the //! cache is invalidated by `rescan` or when a pattern-only cache can't satisfy a diff --git a/src/sse.rs b/src/sse.rs index 0884a7f..d063321 100644 --- a/src/sse.rs +++ b/src/sse.rs @@ -1,5 +1,5 @@ //! Decode OpenAI-compatible responses into a single synthesized assistant -//! message. Ports `_decode_sse_response` / `_decode_response` from the Python. +//! message. use crate::model::DecodedResponse; use serde_json::{json, Value}; @@ -120,7 +120,7 @@ mod tests { use serde_json::json; /// Assemble an SSE blob: one `data: ` line per object + trailing - /// `[DONE]`, matching the Python test helper. + /// `[DONE]`. fn sse(objs: &[Value]) -> String { let mut s: String = objs .iter() diff --git a/tests/cross_impl.rs b/tests/cross_impl.rs index 6a530dc..c2268f0 100644 --- a/tests/cross_impl.rs +++ b/tests/cross_impl.rs @@ -1,5 +1,5 @@ //! Opt-in cross-implementation check: write a parquet with the Rust exporter to -//! `$AGENTCAP_PARQUET_OUT`, so a Python/pyarrow reader can confirm the schema, +//! `$AGENTCAP_PARQUET_OUT`, so a pyarrow reader can confirm the schema, //! KV metadata, and row JSON load cleanly. Ignored by default (needs the env //! var + leaves the file in place): //! diff --git a/tests/live.rs b/tests/live.rs index 1622b49..c965830 100644 --- a/tests/live.rs +++ b/tests/live.rs @@ -1,7 +1,7 @@ //! Live end-to-end tests: drive the real `agentcap run` binary through a real //! OpenAI-compatible server for each agent, asserting the wire path (the agent //! reaches the model through the proxy and the turn completes) — not task -//! quality. Ports `test_cli_live.py` + `test_drivers_live.py`. +//! quality. //! //! `#[ignore]` by default so `cargo test` stays hermetic. The `Test - Live` //! workflow provisions a llama.cpp server + builds the per-agent images, then