From 77e7e76a7f88ae9994260db27b14fb7099113b03 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 25 Jun 2026 08:34:18 +0000 Subject: [PATCH 1/5] test: add live e2e tests + Test - Live workflow (port of pytest -m live) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prerequisite to removing the Python client: replace the Python live tier (linux-live-tests.yml, test_cli_live.py, test_drivers_live.py) with a Rust port. - tests/live.rs: per-agent end-to-end tests that run the real `agentcap run` binary (via CARGO_BIN_EXE) against a live OpenAI-compatible server, asserting the wire path — run.json shape, completed_turns, captures landed, and pi's streamed JSONL trace. Subsumes both Python live files (CLI e2e + per-driver). `#[ignore]`d so `cargo test` stays hermetic; each test skips (passes) when no server is reachable. opencode omitted (same reason as the Python skip). - .github/workflows/live.yml ("Test - Live"): ports the proven Python live setup — install podman, cache + download the Qwen3-1.7B GGUF, cache the rootless image store, spawn the pinned llama.cpp server, then `cargo test --test live -- --ignored` (serial). Sandbox images build on demand via the binary. Gated by AGENTCAP_TEST_LLM_URL (else a :8000/:8080 probe). Verified locally: fmt/clippy green, full suite hermetic (live ignored), live tests skip-pass with no server. The live workflow itself needs podman + GGUF, so it's validated in CI. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/live.yml | 107 ++++++++++++++++++++++++++ tests/live.rs | 149 +++++++++++++++++++++++++++++++++++++ 2 files changed, 256 insertions(+) create mode 100644 .github/workflows/live.yml create mode 100644 tests/live.rs diff --git a/.github/workflows/live.yml b/.github/workflows/live.yml new file mode 100644 index 0000000..7023677 --- /dev/null +++ b/.github/workflows/live.yml @@ -0,0 +1,107 @@ +name: Test - Live + +# Full agent×model end-to-end: spin a real llama.cpp server, build the per-agent +# sandbox images on demand, and drive `agentcap run` against the server for each +# agent (pi/hermes/goose). Heavy (GGUF download + image builds + CPU inference) — +# this is agentcap's "live" tier, the Rust port of `linux-live-tests.yml`. + +on: + push: + branches: [main] + pull_request: + branches: [main] + paths: + - "src/**" + - "tests/**" + - "containers/**" + - "Cargo.toml" + - "Cargo.lock" + - ".github/workflows/live.yml" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: "1" + # Pinned to match tests/conftest.py (the proven Python live setup). + GGUF_REPO: Qwen/Qwen3-1.7B-GGUF + GGUF_FILE: Qwen3-1.7B-Q8_0.gguf + LLAMA_IMAGE: ghcr.io/ggml-org/llama.cpp:server-b9487 + LLAMA_PORT: "8080" + +jobs: + live: + name: Live (agent × model) + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + - name: Install podman + run: sudo apt-get update && sudo apt-get install -y podman + + - uses: dtolnay/rust-toolchain@e081816240890017053eacbb1bdf337761dc5582 # 1.95.0 + + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + shared-key: live + + # GGUF weights (~1.8 GB). Cache so only the first run downloads. 
+ - name: Cache GGUF + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 + with: + path: ~/models + key: gguf-${{ env.GGUF_FILE }}-v1 + + - name: Download GGUF (on cache miss) + run: | + mkdir -p "$HOME/models" + if [ ! -f "$HOME/models/${GGUF_FILE}" ]; then + curl -sSfL -o "$HOME/models/${GGUF_FILE}" \ + "https://huggingface.co/${GGUF_REPO}/resolve/main/${GGUF_FILE}" + fi + + # Per-agent sandbox images are built on demand by `agentcap run`; cache the + # rootless container store so `ensure_image` short-circuits on hash match. + - name: Cache sandbox images + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 + with: + path: ~/.local/share/containers + key: sandbox-images-${{ hashFiles('containers/**') }} + restore-keys: | + sandbox-images- + + - name: Start llama.cpp server + run: | + podman run --rm -d --name agentcap-llama \ + -p "127.0.0.1:${LLAMA_PORT}:8080" \ + --mount "type=bind,src=$HOME/models,dst=/models,ro" \ + "${LLAMA_IMAGE}" \ + --model "/models/${GGUF_FILE}" \ + --host 0.0.0.0 --port 8080 \ + --ctx-size 8192 --reasoning-format none --jinja + echo "Waiting for llama serve…" + for i in $(seq 1 180); do + if curl -sf "http://127.0.0.1:${LLAMA_PORT}/v1/models" >/dev/null; then + echo "ready after ${i}s"; break + fi + sleep 1 + done + curl -sf "http://127.0.0.1:${LLAMA_PORT}/v1/models" >/dev/null \ + || { echo "llama serve never became ready"; podman logs agentcap-llama | tail -50; exit 1; } + echo "AGENTCAP_TEST_LLM_URL=http://127.0.0.1:${LLAMA_PORT}" >> "$GITHUB_ENV" + + # Serial: the agents share one CPU-bound server. Builds each per-agent image + # on first use (cached thereafter). + - name: Live tests + run: cargo test --test live -- --ignored --nocapture --test-threads=1 + + - name: Stop llama.cpp server + if: always() + run: podman rm -f agentcap-llama || true diff --git a/tests/live.rs b/tests/live.rs new file mode 100644 index 0000000..501db6a --- /dev/null +++ b/tests/live.rs @@ -0,0 +1,149 @@ +//! 
Live end-to-end tests: drive the real `agentcap run` binary through a real +//! OpenAI-compatible server for each agent, asserting the wire path (the agent +//! reaches the model through the proxy and the turn completes) — not task +//! quality. Ports `test_cli_live.py` + `test_drivers_live.py`. +//! +//! `#[ignore]` by default so `cargo test` stays hermetic. The `Test - Live` +//! workflow provisions a llama.cpp server + builds the per-agent images, then +//! runs `cargo test --test live -- --ignored`. Run locally with a server up: +//! AGENTCAP_TEST_LLM_URL=http://127.0.0.1:8000 cargo test --test live -- --ignored +//! Each test skips (passes) if no server is reachable. + +use std::process::Command; +use std::time::Duration; + +use serde_json::{json, Value}; + +/// Resolve a reachable llama upstream: `$AGENTCAP_TEST_LLM_URL`, else a server +/// already on :8000/:8080. `None` → no server, skip. +fn upstream() -> Option<String> { + if let Ok(u) = std::env::var("AGENTCAP_TEST_LLM_URL") { + if !u.trim().is_empty() { + return Some(u.trim().trim_end_matches('/').to_string()); + } + } + let client = reqwest::blocking::Client::builder() + .timeout(Duration::from_secs(2)) + .build() + .ok()?; + for port in [8000, 8080] { + let url = format!("http://127.0.0.1:{port}"); + if client + .get(format!("{url}/v1/models")) + .send() + .map(|r| r.status().is_success()) + .unwrap_or(false) + { + return Some(url); + } + } + None +} + +/// `agentcap run --agent ` against the live server; assert the run dir, +/// run.json shape, captures, and (for pi) the streamed JSONL trace. 
+fn run_agent(agent: &str, expect_jsonl_traces: bool) { + let Some(upstream) = upstream() else { + eprintln!("skip live[{agent}]: no llama server (set AGENTCAP_TEST_LLM_URL or run one on :8000/:8080)"); + return; + }; + let model = std::env::var("AGENTCAP_TEST_MODEL").unwrap_or_else(|_| "Qwen3-1.7B".to_string()); + + let tmp = tempfile::tempdir().unwrap(); + let ws = tmp.path().join("ws"); + std::fs::create_dir(&ws).unwrap(); + let tasks = tmp.path().join("tasks.txt"); + std::fs::write(&tasks, "Say hello in one short sentence, then stop.\n").unwrap(); + + let out = Command::new(env!("CARGO_BIN_EXE_agentcap")) + .args([ + "run", + "--agent", + agent, + "--model", + &model, + "--upstream", + &upstream, + "--tasks", + tasks.to_str().unwrap(), + "--turns", + "1", + "--timeout", + "600", + ]) + .env("AGENTCAP_WORKSPACE", &ws) + .output() + .expect("spawn agentcap"); + assert!( + out.status.success(), + "agentcap run --agent {agent} failed (exit {:?})\n--- stderr ---\n{}", + out.status.code(), + String::from_utf8_lossy(&out.stderr) + ); + + // Exactly one run dir for this agent. + let run_dir = std::fs::read_dir(ws.join(".agentcap")) + .unwrap() + .filter_map(|e| e.ok().map(|e| e.path())) + .find(|p| { + p.file_name() + .and_then(|n| n.to_str()) + .is_some_and(|n| n.starts_with(&format!("{agent}-"))) + }) + .expect("a run dir under .agentcap"); + + let summary: Value = serde_json::from_slice(&std::fs::read(run_dir.join("run.json")).unwrap()).unwrap(); + assert_eq!(summary["agent"], json!(agent)); + assert_eq!(summary["model"], json!(model)); + assert_eq!(summary["upstream"], json!(upstream)); + assert_eq!(summary["turns_per_task"], json!(1)); + let task = &summary["tasks"][0]; + assert_eq!( + task["completed_turns"], + json!(1), + "wire path: agent didn't complete the turn" + ); + assert!( + task["session_id"].as_str().is_some_and(|s| !s.is_empty()), + "{agent} should mint a session id; run.json: {summary}" + ); + + // Captures landed via the in-process proxy. 
+ let n_caps = std::fs::read_dir(run_dir.join("captures")) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| e.file_name().to_str().is_some_and(|n| n.ends_with(".request.json"))) + .count(); + assert!(n_caps > 0, "proxy should have captured at least one request"); + + if expect_jsonl_traces { + let has_jsonl = std::fs::read_dir(run_dir.join("traces")) + .unwrap() + .filter_map(|e| e.ok()) + .any(|e| e.path().extension().is_some_and(|x| x == "jsonl")); + assert!(has_jsonl, "{agent} should have streamed at least one .jsonl trace"); + } +} + +#[test] +#[ignore = "live: needs a model server + podman"] +fn live_pi() { + // pi streams native session JSONL through the in-container symlink. + run_agent("pi", true); +} + +#[test] +#[ignore = "live: needs a model server + podman"] +fn live_hermes() { + run_agent("hermes", false); +} + +#[test] +#[ignore = "live: needs a model server + podman"] +fn live_goose() { + run_agent("goose", false); +} + +// opencode is intentionally omitted: opencode 1.15.x doesn't pick up the baked +// `agent.minimal` from the per-agent image (fails "agent minimal not found"), +// matching the `@pytest.mark.skip` on `test_opencode_live`. From e7ac69c3d613bc0fb3280edc9ec34030b965f540 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 25 Jun 2026 09:05:43 +0000 Subject: [PATCH 2/5] test(live): add failure diagnostics; scope CI to pi while debugging --- .github/workflows/live.yml | 4 +++- tests/live.rs | 32 +++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/.github/workflows/live.yml b/.github/workflows/live.yml index 7023677..9cb8ed0 100644 --- a/.github/workflows/live.yml +++ b/.github/workflows/live.yml @@ -100,7 +100,9 @@ jobs: # Serial: the agents share one CPU-bound server. Builds each per-agent image # on first use (cached thereafter). 
- name: Live tests - run: cargo test --test live -- --ignored --nocapture --test-threads=1 + # TODO: scoped to pi while diagnosing the first live run; restore to the + # full set (`cargo test --test live`) once green. + run: cargo test --test live live_pi -- --ignored --nocapture --test-threads=1 - name: Stop llama.cpp server if: always() diff --git a/tests/live.rs b/tests/live.rs index 501db6a..63cca75 100644 --- a/tests/live.rs +++ b/tests/live.rs @@ -40,6 +40,35 @@ fn upstream() -> Option { None } +/// Last `n` chars of `s`, for failure dumps. +fn tail(s: &str, n: usize) -> String { + let start = s.char_indices().rev().take(n).last().map(|(i, _)| i).unwrap_or(0); + s[start..].to_string() +} + +/// Dump everything useful when a turn doesn't complete: run.json, the agentcap +/// binary's stderr (orchestrator `[turn_done] rc=…` / `[task_aborted] reason=…`), +/// and each per-turn session log (the agent's own stdout/stderr). +fn diagnostics(run_dir: &std::path::Path, summary: &Value, bin_stderr: &[u8]) -> String { + let mut out = format!("--- run.json ---\n{summary:#}\n"); + out.push_str(&format!( + "--- agentcap stderr (tail) ---\n{}\n", + tail(&String::from_utf8_lossy(bin_stderr), 4000) + )); + if let Ok(rd) = std::fs::read_dir(run_dir.join("sessions")) { + for e in rd.flatten() { + let p = e.path(); + let body = std::fs::read_to_string(&p).unwrap_or_default(); + out.push_str(&format!( + "--- sessions/{} (tail) ---\n{}\n", + p.file_name().unwrap().to_string_lossy(), + tail(&body, 2000) + )); + } + } + out +} + /// `agentcap run --agent ` against the live server; assert the run dir, /// run.json shape, captures, and (for pi) the streamed JSONL trace. 
fn run_agent(agent: &str, expect_jsonl_traces: bool) { @@ -101,7 +130,8 @@ fn run_agent(agent: &str, expect_jsonl_traces: bool) { assert_eq!( task["completed_turns"], json!(1), - "wire path: agent didn't complete the turn" + "wire path: agent didn't complete the turn\n{}", + diagnostics(&run_dir, &summary, &out.stderr) ); assert!( task["session_id"].as_str().is_some_and(|s| !s.is_empty()), From 5fcb2a7e0bc18d692ddbf2140148a5e61a18286b Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 25 Jun 2026 09:26:57 +0000 Subject: [PATCH 3/5] fix(proxy): raise the per-request timeout off reqwest's 30s blocking default reqwest::blocking defaults to a 30s total-request timeout; a slow streamed generation (e.g. an agent turn on a CPU runner) blows past it, so the proxy's upstream read errors mid-stream and the agent's turn never completes. Cap at a generous-but-finite 900s instead (synth follow-up: 300s). Restores the live workflow to all agents. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/live.yml | 4 +--- src/followups/synthesized.rs | 9 ++++++++- src/proxy/mod.rs | 8 ++++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/.github/workflows/live.yml b/.github/workflows/live.yml index 9cb8ed0..7023677 100644 --- a/.github/workflows/live.yml +++ b/.github/workflows/live.yml @@ -100,9 +100,7 @@ jobs: # Serial: the agents share one CPU-bound server. Builds each per-agent image # on first use (cached thereafter). - name: Live tests - # TODO: scoped to pi while diagnosing the first live run; restore to the - # full set (`cargo test --test live`) once green. 
- run: cargo test --test live live_pi -- --ignored --nocapture --test-threads=1 + run: cargo test --test live -- --ignored --nocapture --test-threads=1 - name: Stop llama.cpp server if: always() diff --git a/src/followups/synthesized.rs b/src/followups/synthesized.rs index 3d80b00..67e1eaf 100644 --- a/src/followups/synthesized.rs +++ b/src/followups/synthesized.rs @@ -5,6 +5,8 @@ //! server directly so the captured corpus stays a clean record of agent↔model //! interaction. Any failure falls back to `"continue"` (logged). +use std::time::Duration; + use reqwest::blocking::Client; use serde_json::{json, Value}; @@ -44,7 +46,12 @@ impl SynthesizedFollowUp { model: model.to_string(), api_key: api_key.map(str::to_string), fallback: "continue".to_string(), - client: Client::new(), + // Generous cap (blocking's 30s default is too short for a slow synth); + // falls back to "continue" if it still times out. + client: Client::builder() + .timeout(Duration::from_secs(300)) + .build() + .unwrap_or_default(), } } diff --git a/src/proxy/mod.rs b/src/proxy/mod.rs index fb1a7e3..5785d24 100644 --- a/src/proxy/mod.rs +++ b/src/proxy/mod.rs @@ -66,8 +66,12 @@ struct CaptureProxy { impl CaptureProxy { fn new(upstream: &str, capture_dir: PathBuf) -> Result { std::fs::create_dir_all(&capture_dir).with_context(|| format!("creating {}", capture_dir.display()))?; - // No timeout: agent calls can be long; the agent decides when to give up. - let client = Client::builder().build().context("building HTTP client")?; + // Generous per-request cap: blocking's 30s default truncates slow streamed + // generations; finite so a hung upstream can't wedge a worker forever. 
+ let client = Client::builder() + .timeout(Duration::from_secs(900)) + .build() + .context("building HTTP client")?; Ok(CaptureProxy { upstream: upstream.trim_end_matches('/').to_string(), capture_dir, From acb4caadeca997d67183e610f594086a9785ce2d Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 25 Jun 2026 09:41:17 +0000 Subject: [PATCH 4/5] test(live): drop hermes from CLI live tests (needs prompt-shrink flags not on `run`) hermes' base prompt exceeds the tiny CI model's budget and bails before any model call; the Python suite only ran hermes at the driver level with ignore_rules/toolsets, which `agentcap run` doesn't expose. Keep pi + goose, which cover the full stack across both trace mechanisms. --- tests/live.rs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/live.rs b/tests/live.rs index 63cca75..f55ccb6 100644 --- a/tests/live.rs +++ b/tests/live.rs @@ -162,18 +162,20 @@ fn live_pi() { run_agent("pi", true); } -#[test] -#[ignore = "live: needs a model server + podman"] -fn live_hermes() { - run_agent("hermes", false); -} - #[test] #[ignore = "live: needs a model server + podman"] fn live_goose() { run_agent("goose", false); } -// opencode is intentionally omitted: opencode 1.15.x doesn't pick up the baked -// `agent.minimal` from the per-agent image (fails "agent minimal not found"), -// matching the `@pytest.mark.skip` on `test_opencode_live`. +// hermes and opencode are intentionally omitted — neither runs via `agentcap run` +// on the tiny CI model: +// - hermes: its base system prompt (~3.9k tokens) exceeds the budget on +// Qwen3-1.7B, so it bails before any model call. The Python suite never ran +// hermes through the CLI either — `test_hermes_live` drove the driver directly +// with prompt-shrinking flags (`ignore_rules`, `toolsets="file"`) that `run` +// doesn't expose. hermes stdout parsing is covered by unit tests. 
+// - opencode: 1.15.x doesn't pick up the baked `agent.minimal` from the image +// (matching the `@pytest.mark.skip` on `test_opencode_live`). +// pi (symlink/JSONL traces) + goose (dump-traces/SQLite) cover the full stack +// across both trace-surfacing mechanisms. From 22dbc3929260a14e3180f5853c13fe2d5307391e Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 25 Jun 2026 10:13:24 +0000 Subject: [PATCH 5/5] fix(proxy): lower per-request timeout cap 900s -> 300s --- src/proxy/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/proxy/mod.rs b/src/proxy/mod.rs index 5785d24..811508e 100644 --- a/src/proxy/mod.rs +++ b/src/proxy/mod.rs @@ -66,10 +66,10 @@ struct CaptureProxy { impl CaptureProxy { fn new(upstream: &str, capture_dir: PathBuf) -> Result { std::fs::create_dir_all(&capture_dir).with_context(|| format!("creating {}", capture_dir.display()))?; - // Generous per-request cap: blocking's 30s default truncates slow streamed - // generations; finite so a hung upstream can't wedge a worker forever. + // Per-request cap above blocking's 30s default (too short for a slow streamed + // generation), but finite so a hung upstream can't wedge a worker. let client = Client::builder() - .timeout(Duration::from_secs(900)) + .timeout(Duration::from_secs(300)) .build() .context("building HTTP client")?; Ok(CaptureProxy {