From 77e7e76a7f88ae9994260db27b14fb7099113b03 Mon Sep 17 00:00:00 2001
From: David Corvoysier <david@huggingface.co>
Date: Thu, 25 Jun 2026 08:34:18 +0000
Subject: [PATCH 1/5] test: add live e2e tests + Test - Live workflow (port of
 pytest -m live)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prerequisite to removing the Python client: replace the Python live tier
(linux-live-tests.yml, test_cli_live.py, test_drivers_live.py) with a Rust port.

- tests/live.rs: per-agent end-to-end tests that run the real `agentcap run`
  binary (via CARGO_BIN_EXE) against a live OpenAI-compatible server, asserting
  the wire path — run.json shape, completed_turns, captures landed, and pi's
  streamed JSONL trace. Subsumes both Python live files (CLI e2e + per-driver).
  `#[ignore]`d so `cargo test` stays hermetic; each test skips (passes) when no
  server is reachable. opencode omitted (same reason as the Python skip).
- .github/workflows/live.yml ("Test - Live"): ports the proven Python live
  setup — install podman, cache + download the Qwen3-1.7B GGUF, cache the
  rootless image store, spawn the pinned llama.cpp server, then
  `cargo test --test live -- --ignored` (serial). Sandbox images build on
  demand via the binary.

Gated by AGENTCAP_TEST_LLM_URL (else a :8000/:8080 probe). Verified locally:
fmt/clippy green, full suite hermetic (live ignored), live tests skip-pass with
no server. The live workflow itself needs podman + GGUF, so it's validated in CI.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/live.yml | 107 ++++++++++++++++++++++++++
 tests/live.rs              | 149 +++++++++++++++++++++++++++++++++++++
 2 files changed, 256 insertions(+)
 create mode 100644 .github/workflows/live.yml
 create mode 100644 tests/live.rs

diff --git a/.github/workflows/live.yml b/.github/workflows/live.yml
new file mode 100644
index 0000000..7023677
--- /dev/null
+++ b/.github/workflows/live.yml
@@ -0,0 +1,107 @@
+name: Test - Live
+
+# Full agent×model end-to-end: spin a real llama.cpp server, build the per-agent
+# sandbox images on demand, and drive `agentcap run` against the server for each
+# agent (pi/hermes/goose). Heavy (GGUF download + image builds + CPU inference) —
+# this is agentcap's "live" tier, the Rust port of `linux-live-tests.yml`.
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+    paths:
+      - "src/**"
+      - "tests/**"
+      - "containers/**"
+      - "Cargo.toml"
+      - "Cargo.lock"
+      - ".github/workflows/live.yml"
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+env:
+  CARGO_TERM_COLOR: always
+  RUST_BACKTRACE: "1"
+  # Pinned to match tests/conftest.py (the proven Python live setup).
+  GGUF_REPO: Qwen/Qwen3-1.7B-GGUF
+  GGUF_FILE: Qwen3-1.7B-Q8_0.gguf
+  LLAMA_IMAGE: ghcr.io/ggml-org/llama.cpp:server-b9487
+  LLAMA_PORT: "8080"
+
+jobs:
+  live:
+    name: Live (agent × model)
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+
+      - name: Install podman
+        run: sudo apt-get update && sudo apt-get install -y podman
+
+      - uses: dtolnay/rust-toolchain@e081816240890017053eacbb1bdf337761dc5582 # 1.95.0
+
+      - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2
+        with:
+          shared-key: live
+
+      # GGUF weights (~1.8 GB). Cache so only the first run downloads.
+      - name: Cache GGUF
+        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        with:
+          path: ~/models
+          key: gguf-${{ env.GGUF_FILE }}-v1
+
+      - name: Download GGUF (on cache miss)
+        run: |
+          mkdir -p "$HOME/models"
+          if [ ! -f "$HOME/models/${GGUF_FILE}" ]; then
+            curl -sSfL -o "$HOME/models/${GGUF_FILE}" \
+              "https://huggingface.co/${GGUF_REPO}/resolve/main/${GGUF_FILE}"
+          fi
+
+      # Per-agent sandbox images are built on demand by `agentcap run`; cache the
+      # rootless container store so `ensure_image` short-circuits on hash match.
+      - name: Cache sandbox images
+        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        with:
+          path: ~/.local/share/containers
+          key: sandbox-images-${{ hashFiles('containers/**') }}
+          restore-keys: |
+            sandbox-images-
+
+      - name: Start llama.cpp server # no --rm: keep an exited container so `podman logs` below still works
+        run: |
+          podman run -d --name agentcap-llama \
+            -p "127.0.0.1:${LLAMA_PORT}:8080" \
+            --mount "type=bind,src=$HOME/models,dst=/models,ro" \
+            "${LLAMA_IMAGE}" \
+            --model "/models/${GGUF_FILE}" \
+            --host 0.0.0.0 --port 8080 \
+            --ctx-size 8192 --reasoning-format none --jinja
+          echo "Waiting for llama serve…"
+          for i in $(seq 1 180); do
+            if curl -sf "http://127.0.0.1:${LLAMA_PORT}/v1/models" >/dev/null; then
+              echo "ready after ${i}s"; break
+            fi
+            sleep 1
+          done
+          curl -sf "http://127.0.0.1:${LLAMA_PORT}/v1/models" >/dev/null \
+            || { echo "llama serve never became ready"; podman logs agentcap-llama | tail -50; exit 1; }
+          echo "AGENTCAP_TEST_LLM_URL=http://127.0.0.1:${LLAMA_PORT}" >> "$GITHUB_ENV"
+
+      # Serial: the agents share one CPU-bound server. Builds each per-agent image
+      # on first use (cached thereafter).
+      - name: Live tests
+        run: cargo test --test live -- --ignored --nocapture --test-threads=1
+
+      - name: Stop llama.cpp server
+        if: always()
+        run: podman rm -f agentcap-llama || true
diff --git a/tests/live.rs b/tests/live.rs
new file mode 100644
index 0000000..501db6a
--- /dev/null
+++ b/tests/live.rs
@@ -0,0 +1,149 @@
+//! Live end-to-end tests: drive the real `agentcap run` binary through a real
+//! OpenAI-compatible server for each agent, asserting the wire path (the agent
+//! reaches the model through the proxy and the turn completes) — not task
+//! quality. Ports `test_cli_live.py` + `test_drivers_live.py`.
+//!
+//! `#[ignore]` by default so `cargo test` stays hermetic. The `Test - Live`
+//! workflow provisions a llama.cpp server (images build on demand), then runs
+//! `cargo test --test live -- --ignored`. Run locally with a server up:
+//!   AGENTCAP_TEST_LLM_URL=http://127.0.0.1:8000 cargo test --test live -- --ignored
+//! Each test skips (passes) if no server is reachable.
+
+use std::process::Command;
+use std::time::Duration;
+
+use serde_json::{json, Value};
+
+/// Resolve the llama upstream: `$AGENTCAP_TEST_LLM_URL` (trusted, not probed),
+/// else a server already answering on :8000/:8080. `None` → no server, skip.
+fn upstream() -> Option<String> {
+    if let Ok(u) = std::env::var("AGENTCAP_TEST_LLM_URL") {
+        if !u.trim().is_empty() {
+            return Some(u.trim().trim_end_matches('/').to_string());
+        }
+    }
+    let client = reqwest::blocking::Client::builder()
+        .timeout(Duration::from_secs(2))
+        .build()
+        .ok()?;
+    for port in [8000, 8080] {
+        let url = format!("http://127.0.0.1:{port}");
+        if client
+            .get(format!("{url}/v1/models"))
+            .send()
+            .map(|r| r.status().is_success())
+            .unwrap_or(false)
+        {
+            return Some(url);
+        }
+    }
+    None
+}
+
+/// `agentcap run --agent <agent>` against the live server; assert the run dir,
+/// run.json shape, captures, and (for pi) the streamed JSONL trace.
+fn run_agent(agent: &str, expect_jsonl_traces: bool) {
+    let Some(upstream) = upstream() else {
+        eprintln!("skip live[{agent}]: no llama server (set AGENTCAP_TEST_LLM_URL or run one on :8000/:8080)");
+        return;
+    };
+    let model = std::env::var("AGENTCAP_TEST_MODEL").unwrap_or_else(|_| "Qwen3-1.7B".to_string());
+
+    let tmp = tempfile::tempdir().unwrap();
+    let ws = tmp.path().join("ws");
+    std::fs::create_dir(&ws).unwrap();
+    let tasks = tmp.path().join("tasks.txt");
+    std::fs::write(&tasks, "Say hello in one short sentence, then stop.\n").unwrap();
+
+    let out = Command::new(env!("CARGO_BIN_EXE_agentcap"))
+        .args([
+            "run",
+            "--agent",
+            agent,
+            "--model",
+            &model,
+            "--upstream",
+            &upstream,
+            "--tasks",
+            tasks.to_str().unwrap(),
+            "--turns",
+            "1",
+            "--timeout",
+            "600",
+        ])
+        .env("AGENTCAP_WORKSPACE", &ws)
+        .output()
+        .expect("spawn agentcap");
+    assert!(
+        out.status.success(),
+        "agentcap run --agent {agent} failed (exit {:?})\n--- stderr ---\n{}",
+        out.status.code(),
+        String::from_utf8_lossy(&out.stderr)
+    );
+
+    // The run dir for this agent: first `<agent>-*` entry (uniqueness is not asserted).
+    let run_dir = std::fs::read_dir(ws.join(".agentcap"))
+        .unwrap()
+        .filter_map(|e| e.ok().map(|e| e.path()))
+        .find(|p| {
+            p.file_name()
+                .and_then(|n| n.to_str())
+                .is_some_and(|n| n.starts_with(&format!("{agent}-")))
+        })
+        .expect("a run dir under .agentcap");
+
+    let summary: Value = serde_json::from_slice(&std::fs::read(run_dir.join("run.json")).unwrap()).unwrap();
+    assert_eq!(summary["agent"], json!(agent));
+    assert_eq!(summary["model"], json!(model));
+    assert_eq!(summary["upstream"], json!(upstream));
+    assert_eq!(summary["turns_per_task"], json!(1));
+    let task = &summary["tasks"][0];
+    assert_eq!(
+        task["completed_turns"],
+        json!(1),
+        "wire path: agent didn't complete the turn"
+    );
+    assert!(
+        task["session_id"].as_str().is_some_and(|s| !s.is_empty()),
+        "{agent} should mint a session id; run.json: {summary}"
+    );
+
+    // Captures landed via the in-process proxy.
+    let n_caps = std::fs::read_dir(run_dir.join("captures"))
+        .unwrap()
+        .filter_map(|e| e.ok())
+        .filter(|e| e.file_name().to_str().is_some_and(|n| n.ends_with(".request.json")))
+        .count();
+    assert!(n_caps > 0, "proxy should have captured at least one request");
+
+    if expect_jsonl_traces {
+        let has_jsonl = std::fs::read_dir(run_dir.join("traces"))
+            .unwrap()
+            .filter_map(|e| e.ok())
+            .any(|e| e.path().extension().is_some_and(|x| x == "jsonl"));
+        assert!(has_jsonl, "{agent} should have streamed at least one .jsonl trace");
+    }
+}
+
+#[test]
+#[ignore = "live: needs a model server + podman"]
+fn live_pi() {
+    // pi streams native session JSONL through the in-container symlink.
+    run_agent("pi", true);
+}
+
+#[test]
+#[ignore = "live: needs a model server + podman"]
+fn live_hermes() {
+    run_agent("hermes", false);
+}
+
+#[test]
+#[ignore = "live: needs a model server + podman"]
+fn live_goose() {
+    run_agent("goose", false);
+}
+
+// opencode is intentionally omitted: opencode 1.15.x doesn't pick up the baked
+// `agent.minimal` from the per-agent image (fails "agent minimal not found"),
+// matching the `@pytest.mark.skip` on `test_opencode_live`.

From e7ac69c3d613bc0fb3280edc9ec34030b965f540 Mon Sep 17 00:00:00 2001
From: David Corvoysier <david@huggingface.co>
Date: Thu, 25 Jun 2026 09:05:43 +0000
Subject: [PATCH 2/5] test(live): add failure diagnostics; scope CI to pi while
 debugging

---
 .github/workflows/live.yml |  4 +++-
 tests/live.rs              | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/live.yml b/.github/workflows/live.yml
index 7023677..9cb8ed0 100644
--- a/.github/workflows/live.yml
+++ b/.github/workflows/live.yml
@@ -100,7 +100,9 @@ jobs:
       # Serial: the agents share one CPU-bound server. Builds each per-agent image
       # on first use (cached thereafter).
       - name: Live tests
-        run: cargo test --test live -- --ignored --nocapture --test-threads=1
+        # TODO: scoped to pi while diagnosing the first live run; restore to the
+        # full set (`cargo test --test live`) once green.
+        run: cargo test --test live live_pi -- --ignored --nocapture --test-threads=1
 
       - name: Stop llama.cpp server
         if: always()
diff --git a/tests/live.rs b/tests/live.rs
index 501db6a..63cca75 100644
--- a/tests/live.rs
+++ b/tests/live.rs
@@ -40,6 +40,35 @@ fn upstream() -> Option<String> {
     None
 }
 
+/// Last `n` chars of `s` (empty when `n == 0`), for failure dumps.
+fn tail(s: &str, n: usize) -> String {
+    let start = s.char_indices().rev().take(n).last().map_or(s.len(), |(i, _)| i);
+    s[start..].to_string()
+}
+
+/// Dump everything useful when a turn doesn't complete: run.json, the agentcap
+/// binary's stderr (orchestrator `[turn_done] rc=…` / `[task_aborted] reason=…`),
+/// and each per-turn session log (the agent's own stdout/stderr).
+fn diagnostics(run_dir: &std::path::Path, summary: &Value, bin_stderr: &[u8]) -> String {
+    let mut out = format!("--- run.json ---\n{summary:#}\n");
+    out.push_str(&format!(
+        "--- agentcap stderr (tail) ---\n{}\n",
+        tail(&String::from_utf8_lossy(bin_stderr), 4000)
+    ));
+    if let Ok(rd) = std::fs::read_dir(run_dir.join("sessions")) {
+        for e in rd.flatten() {
+            let p = e.path();
+            let body = std::fs::read_to_string(&p).unwrap_or_default();
+            out.push_str(&format!(
+                "--- sessions/{} (tail) ---\n{}\n",
+                p.file_name().unwrap().to_string_lossy(),
+                tail(&body, 2000)
+            ));
+        }
+    }
+    out
+}
+
 /// `agentcap run --agent <agent>` against the live server; assert the run dir,
 /// run.json shape, captures, and (for pi) the streamed JSONL trace.
 fn run_agent(agent: &str, expect_jsonl_traces: bool) {
@@ -101,7 +130,8 @@ fn run_agent(agent: &str, expect_jsonl_traces: bool) {
     assert_eq!(
         task["completed_turns"],
         json!(1),
-        "wire path: agent didn't complete the turn"
+        "wire path: agent didn't complete the turn\n{}",
+        diagnostics(&run_dir, &summary, &out.stderr)
     );
     assert!(
         task["session_id"].as_str().is_some_and(|s| !s.is_empty()),

From 5fcb2a7e0bc18d692ddbf2140148a5e61a18286b Mon Sep 17 00:00:00 2001
From: David Corvoysier <david@huggingface.co>
Date: Thu, 25 Jun 2026 09:26:57 +0000
Subject: [PATCH 3/5] fix(proxy): raise the per-request timeout off reqwest's
 30s blocking default

reqwest::blocking defaults to a 30s total-request timeout; a slow streamed
generation (e.g. an agent turn on a CPU runner) blows past it, so the proxy's
upstream read errors mid-stream and the agent's turn never completes. Cap at a
generous-but-finite 900s instead (synth follow-up: 300s). Restores the live
workflow to all agents.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/live.yml   | 4 +---
 src/followups/synthesized.rs | 9 ++++++++-
 src/proxy/mod.rs             | 8 ++++++--
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/live.yml b/.github/workflows/live.yml
index 9cb8ed0..7023677 100644
--- a/.github/workflows/live.yml
+++ b/.github/workflows/live.yml
@@ -100,9 +100,7 @@ jobs:
       # Serial: the agents share one CPU-bound server. Builds each per-agent image
       # on first use (cached thereafter).
       - name: Live tests
-        # TODO: scoped to pi while diagnosing the first live run; restore to the
-        # full set (`cargo test --test live`) once green.
-        run: cargo test --test live live_pi -- --ignored --nocapture --test-threads=1
+        run: cargo test --test live -- --ignored --nocapture --test-threads=1
 
       - name: Stop llama.cpp server
         if: always()
diff --git a/src/followups/synthesized.rs b/src/followups/synthesized.rs
index 3d80b00..67e1eaf 100644
--- a/src/followups/synthesized.rs
+++ b/src/followups/synthesized.rs
@@ -5,6 +5,8 @@
 //! server directly so the captured corpus stays a clean record of agent↔model
 //! interaction. Any failure falls back to `"continue"` (logged).
 
+use std::time::Duration;
+
 use reqwest::blocking::Client;
 use serde_json::{json, Value};
 
@@ -44,7 +46,12 @@ impl SynthesizedFollowUp {
             model: model.to_string(),
             api_key: api_key.map(str::to_string),
             fallback: "continue".to_string(),
-            client: Client::new(),
+            // Generous cap (blocking's 30s default is too short for a slow synth);
+            // falls back to "continue" if it still times out.
+            client: Client::builder()
+                .timeout(Duration::from_secs(300))
+                .build()
+                .unwrap_or_default(),
         }
     }
 
diff --git a/src/proxy/mod.rs b/src/proxy/mod.rs
index fb1a7e3..5785d24 100644
--- a/src/proxy/mod.rs
+++ b/src/proxy/mod.rs
@@ -66,8 +66,12 @@ struct CaptureProxy {
 impl CaptureProxy {
     fn new(upstream: &str, capture_dir: PathBuf) -> Result<Self> {
         std::fs::create_dir_all(&capture_dir).with_context(|| format!("creating {}", capture_dir.display()))?;
-        // No timeout: agent calls can be long; the agent decides when to give up.
-        let client = Client::builder().build().context("building HTTP client")?;
+        // Generous per-request cap: blocking's 30s default truncates slow streamed
+        // generations; finite so a hung upstream can't wedge a worker forever.
+        let client = Client::builder()
+            .timeout(Duration::from_secs(900))
+            .build()
+            .context("building HTTP client")?;
         Ok(CaptureProxy {
             upstream: upstream.trim_end_matches('/').to_string(),
             capture_dir,

From acb4caadeca997d67183e610f594086a9785ce2d Mon Sep 17 00:00:00 2001
From: David Corvoysier <david@huggingface.co>
Date: Thu, 25 Jun 2026 09:41:17 +0000
Subject: [PATCH 4/5] test(live): drop hermes from CLI live tests (needs
 prompt-shrink flags not on `run`)

hermes' base prompt exceeds the tiny CI model's budget and bails before any
model call; the Python suite only ran hermes at the driver level with
ignore_rules/toolsets, which `agentcap run` doesn't expose. Keep pi + goose,
which cover the full stack across both trace mechanisms.
---
 tests/live.rs | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/tests/live.rs b/tests/live.rs
index 63cca75..f55ccb6 100644
--- a/tests/live.rs
+++ b/tests/live.rs
@@ -162,18 +162,20 @@ fn live_pi() {
     run_agent("pi", true);
 }
 
-#[test]
-#[ignore = "live: needs a model server + podman"]
-fn live_hermes() {
-    run_agent("hermes", false);
-}
-
 #[test]
 #[ignore = "live: needs a model server + podman"]
 fn live_goose() {
     run_agent("goose", false);
 }
 
-// opencode is intentionally omitted: opencode 1.15.x doesn't pick up the baked
-// `agent.minimal` from the per-agent image (fails "agent minimal not found"),
-// matching the `@pytest.mark.skip` on `test_opencode_live`.
+// hermes and opencode are intentionally omitted — neither runs via `agentcap run`
+// on the tiny CI model:
+//   - hermes: its base system prompt (~3.9k tokens) exceeds the budget on
+//     Qwen3-1.7B, so it bails before any model call. The Python suite never ran
+//     hermes through the CLI either — `test_hermes_live` drove the driver directly
+//     with prompt-shrinking flags (`ignore_rules`, `toolsets="file"`) that `run`
+//     doesn't expose. hermes stdout parsing is covered by unit tests.
+//   - opencode: 1.15.x doesn't pick up the baked `agent.minimal` from the image
+//     (matching the `@pytest.mark.skip` on `test_opencode_live`).
+// pi (symlink/JSONL traces) + goose (dump-traces/SQLite) cover the full stack
+// across both trace-surfacing mechanisms.

From 22dbc3929260a14e3180f5853c13fe2d5307391e Mon Sep 17 00:00:00 2001
From: David Corvoysier <david@huggingface.co>
Date: Thu, 25 Jun 2026 10:13:24 +0000
Subject: [PATCH 5/5] fix(proxy): lower per-request timeout cap 900s -> 300s

---
 src/proxy/mod.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/proxy/mod.rs b/src/proxy/mod.rs
index 5785d24..811508e 100644
--- a/src/proxy/mod.rs
+++ b/src/proxy/mod.rs
@@ -66,10 +66,10 @@ struct CaptureProxy {
 impl CaptureProxy {
     fn new(upstream: &str, capture_dir: PathBuf) -> Result<Self> {
         std::fs::create_dir_all(&capture_dir).with_context(|| format!("creating {}", capture_dir.display()))?;
-        // Generous per-request cap: blocking's 30s default truncates slow streamed
-        // generations; finite so a hung upstream can't wedge a worker forever.
+        // Per-request cap above blocking's 30s default (too short for a slow streamed
+        // generation), but finite so a hung upstream can't wedge a worker.
         let client = Client::builder()
-            .timeout(Duration::from_secs(900))
+            .timeout(Duration::from_secs(300))
             .build()
             .context("building HTTP client")?;
         Ok(CaptureProxy {