Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions .github/workflows/live.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
name: Test - Live

# Full agent×model end-to-end: spin up a real llama.cpp server, build the per-agent
# sandbox images on demand, and drive `agentcap run` against the server for each
# covered agent (pi/goose; hermes and opencode are intentionally omitted — see the
# note at the end of tests/live.rs). Heavy (GGUF download + image builds + CPU
# inference) — this is agentcap's "live" tier, the Rust port of `linux-live-tests.yml`.

on:
push:
branches: [main]
pull_request:
branches: [main]
paths:
- "src/**"
- "tests/**"
- "containers/**"
- "Cargo.toml"
- "Cargo.lock"
- ".github/workflows/live.yml"
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

permissions:
contents: read

env:
CARGO_TERM_COLOR: always
RUST_BACKTRACE: "1"
# Pinned to match tests/conftest.py (the proven Python live setup).
GGUF_REPO: Qwen/Qwen3-1.7B-GGUF
GGUF_FILE: Qwen3-1.7B-Q8_0.gguf
LLAMA_IMAGE: ghcr.io/ggml-org/llama.cpp:server-b9487
LLAMA_PORT: "8080"

jobs:
live:
name: Live (agent × model)
runs-on: ubuntu-latest
timeout-minutes: 60
steps:
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3

- name: Install podman
run: sudo apt-get update && sudo apt-get install -y podman

- uses: dtolnay/rust-toolchain@e081816240890017053eacbb1bdf337761dc5582 # 1.95.0

- uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2
with:
shared-key: live

# GGUF weights (~1.8 GB). Cache so only the first run downloads.
- name: Cache GGUF
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ~/models
key: gguf-${{ env.GGUF_FILE }}-v1

- name: Download GGUF (on cache miss)
run: |
mkdir -p "$HOME/models"
if [ ! -f "$HOME/models/${GGUF_FILE}" ]; then
curl -sSfL -o "$HOME/models/${GGUF_FILE}" \
"https://huggingface.co/${GGUF_REPO}/resolve/main/${GGUF_FILE}"
fi

# Per-agent sandbox images are built on demand by `agentcap run`; cache the
# rootless container store so `ensure_image` short-circuits on hash match.
- name: Cache sandbox images
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ~/.local/share/containers
key: sandbox-images-${{ hashFiles('containers/**') }}
restore-keys: |
sandbox-images-

- name: Start llama.cpp server
run: |
podman run --rm -d --name agentcap-llama \
-p "127.0.0.1:${LLAMA_PORT}:8080" \
--mount "type=bind,src=$HOME/models,dst=/models,ro" \
"${LLAMA_IMAGE}" \
--model "/models/${GGUF_FILE}" \
--host 0.0.0.0 --port 8080 \
--ctx-size 8192 --reasoning-format none --jinja
echo "Waiting for llama serve…"
for i in $(seq 1 180); do
if curl -sf "http://127.0.0.1:${LLAMA_PORT}/v1/models" >/dev/null; then
echo "ready after ${i}s"; break
fi
sleep 1
done
curl -sf "http://127.0.0.1:${LLAMA_PORT}/v1/models" >/dev/null \
|| { echo "llama serve never became ready"; podman logs agentcap-llama | tail -50; exit 1; }
echo "AGENTCAP_TEST_LLM_URL=http://127.0.0.1:${LLAMA_PORT}" >> "$GITHUB_ENV"

# Serial: the agents share one CPU-bound server. Builds each per-agent image
# on first use (cached thereafter).
- name: Live tests
run: cargo test --test live -- --ignored --nocapture --test-threads=1

- name: Stop llama.cpp server
if: always()
run: podman rm -f agentcap-llama || true
9 changes: 8 additions & 1 deletion src/followups/synthesized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
//! server directly so the captured corpus stays a clean record of agent↔model
//! interaction. Any failure falls back to `"continue"` (logged).

use std::time::Duration;

use reqwest::blocking::Client;
use serde_json::{json, Value};

Expand Down Expand Up @@ -44,7 +46,12 @@ impl SynthesizedFollowUp {
model: model.to_string(),
api_key: api_key.map(str::to_string),
fallback: "continue".to_string(),
client: Client::new(),
// Generous cap (blocking's 30s default is too short for a slow synth);
// falls back to "continue" if it still times out.
client: Client::builder()
.timeout(Duration::from_secs(300))
.build()
.unwrap_or_default(),
}
}

Expand Down
8 changes: 6 additions & 2 deletions src/proxy/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,12 @@ struct CaptureProxy {
impl CaptureProxy {
fn new(upstream: &str, capture_dir: PathBuf) -> Result<Self> {
std::fs::create_dir_all(&capture_dir).with_context(|| format!("creating {}", capture_dir.display()))?;
// No timeout: agent calls can be long; the agent decides when to give up.
let client = Client::builder().build().context("building HTTP client")?;
// Per-request cap above blocking's 30s default (too short for a slow streamed
// generation), but finite so a hung upstream can't wedge a worker.
let client = Client::builder()
.timeout(Duration::from_secs(300))
.build()
.context("building HTTP client")?;
Ok(CaptureProxy {
upstream: upstream.trim_end_matches('/').to_string(),
capture_dir,
Expand Down
181 changes: 181 additions & 0 deletions tests/live.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
//! Live end-to-end tests: drive the real `agentcap run` binary through a real
//! OpenAI-compatible server for each agent, asserting the wire path (the agent
//! reaches the model through the proxy and the turn completes) — not task
//! quality. Ports `test_cli_live.py` + `test_drivers_live.py`.
//!
//! `#[ignore]` by default so `cargo test` stays hermetic. The `Test - Live`
//! workflow provisions a llama.cpp server (the per-agent images are built on
//! demand by `agentcap run`), then runs `cargo test --test live -- --ignored`.
//! Run locally with a server up:
//! AGENTCAP_TEST_LLM_URL=http://127.0.0.1:8000 cargo test --test live -- --ignored
//! Each test skips (passes) if `AGENTCAP_TEST_LLM_URL` is unset or blank and no
//! server answers on :8000/:8080; a URL given via the variable is used as-is,
//! without a reachability probe.

use std::process::Command;
use std::time::Duration;

use serde_json::{json, Value};

/// Pick the llama upstream the live tests should talk to.
///
/// Preference order:
/// 1. `$AGENTCAP_TEST_LLM_URL`, when set and not blank — surrounding whitespace
///    and trailing `/` are stripped, and the URL is used without probing it.
/// 2. The first of `http://127.0.0.1:8000` / `http://127.0.0.1:8080` whose
///    `/v1/models` endpoint answers with a success status (2 s request timeout).
///
/// Returns `None` when neither yields a server, which callers treat as "skip".
fn upstream() -> Option<String> {
    let configured = std::env::var("AGENTCAP_TEST_LLM_URL")
        .ok()
        .map(|raw| raw.trim().to_string())
        .filter(|url| !url.is_empty());
    if let Some(url) = configured {
        return Some(url.trim_end_matches('/').to_string());
    }

    // Short timeout: this is only a liveness probe against localhost.
    let probe = reqwest::blocking::Client::builder()
        .timeout(Duration::from_secs(2))
        .build()
        .ok()?;
    [8000, 8080]
        .into_iter()
        .map(|port| format!("http://127.0.0.1:{port}"))
        .find(|base| {
            probe
                .get(format!("{base}/v1/models"))
                .send()
                .is_ok_and(|resp| resp.status().is_success())
        })
}

/// Last `n` chars of `s`, for failure dumps.
///
/// Counts `char`s (Unicode scalar values), not bytes, so the cut always lands
/// on a char boundary. Returns all of `s` when it holds at most `n` chars, and
/// an empty string when `n` is 0 or `s` is empty.
fn tail(s: &str, n: usize) -> String {
    // Byte offset of the n-th char counted from the end. If nothing was taken
    // (`n == 0` or empty `s`) fall back to `s.len()` so the slice is empty —
    // falling back to 0 would return the whole string for `n == 0`.
    let start = s
        .char_indices()
        .rev()
        .take(n)
        .last()
        .map_or(s.len(), |(i, _)| i);
    s[start..].to_string()
}

/// Build the failure report shown when a turn doesn't complete.
///
/// The report concatenates, in order: the pretty-printed `run.json` summary,
/// the tail (last 4000 chars) of the agentcap binary's stderr (orchestrator
/// `[turn_done] rc=…` / `[task_aborted] reason=…` lines), and the tail (last
/// 2000 chars) of every file under `<run_dir>/sessions` (the agent's own
/// stdout/stderr). A missing `sessions` directory or an unreadable session
/// file is tolerated: the former adds nothing, the latter an empty body.
fn diagnostics(run_dir: &std::path::Path, summary: &Value, bin_stderr: &[u8]) -> String {
    let stderr_text = String::from_utf8_lossy(bin_stderr);

    let mut report = String::new();
    report += &format!("--- run.json ---\n{summary:#}\n");
    report += &format!(
        "--- agentcap stderr (tail) ---\n{}\n",
        tail(&stderr_text, 4000)
    );

    // Best-effort: no sessions directory simply means no per-turn logs to show.
    let Ok(entries) = std::fs::read_dir(run_dir.join("sessions")) else {
        return report;
    };
    for entry in entries.flatten() {
        let log_path = entry.path();
        let contents = std::fs::read_to_string(&log_path).unwrap_or_default();
        let name = log_path.file_name().unwrap().to_string_lossy();
        report += &format!(
            "--- sessions/{name} (tail) ---\n{}\n",
            tail(&contents, 2000)
        );
    }
    report
}

/// `agentcap run --agent <agent>` against the live server; assert the run dir,
/// run.json shape, captures, and (for pi) the streamed JSONL trace.
fn run_agent(agent: &str, expect_jsonl_traces: bool) {
let Some(upstream) = upstream() else {
eprintln!("skip live[{agent}]: no llama server (set AGENTCAP_TEST_LLM_URL or run one on :8000/:8080)");
return;
};
let model = std::env::var("AGENTCAP_TEST_MODEL").unwrap_or_else(|_| "Qwen3-1.7B".to_string());

let tmp = tempfile::tempdir().unwrap();
let ws = tmp.path().join("ws");
std::fs::create_dir(&ws).unwrap();
let tasks = tmp.path().join("tasks.txt");
std::fs::write(&tasks, "Say hello in one short sentence, then stop.\n").unwrap();

let out = Command::new(env!("CARGO_BIN_EXE_agentcap"))
.args([
"run",
"--agent",
agent,
"--model",
&model,
"--upstream",
&upstream,
"--tasks",
tasks.to_str().unwrap(),
"--turns",
"1",
"--timeout",
"600",
])
.env("AGENTCAP_WORKSPACE", &ws)
.output()
.expect("spawn agentcap");
assert!(
out.status.success(),
"agentcap run --agent {agent} failed (exit {:?})\n--- stderr ---\n{}",
out.status.code(),
String::from_utf8_lossy(&out.stderr)
);

// Exactly one run dir for this agent.
let run_dir = std::fs::read_dir(ws.join(".agentcap"))
.unwrap()
.filter_map(|e| e.ok().map(|e| e.path()))
.find(|p| {
p.file_name()
.and_then(|n| n.to_str())
.is_some_and(|n| n.starts_with(&format!("{agent}-")))
})
.expect("a run dir under .agentcap");

let summary: Value = serde_json::from_slice(&std::fs::read(run_dir.join("run.json")).unwrap()).unwrap();
assert_eq!(summary["agent"], json!(agent));
assert_eq!(summary["model"], json!(model));
assert_eq!(summary["upstream"], json!(upstream));
assert_eq!(summary["turns_per_task"], json!(1));
let task = &summary["tasks"][0];
assert_eq!(
task["completed_turns"],
json!(1),
"wire path: agent didn't complete the turn\n{}",
diagnostics(&run_dir, &summary, &out.stderr)
);
assert!(
task["session_id"].as_str().is_some_and(|s| !s.is_empty()),
"{agent} should mint a session id; run.json: {summary}"
);

// Captures landed via the in-process proxy.
let n_caps = std::fs::read_dir(run_dir.join("captures"))
.unwrap()
.filter_map(|e| e.ok())
.filter(|e| e.file_name().to_str().is_some_and(|n| n.ends_with(".request.json")))
.count();
assert!(n_caps > 0, "proxy should have captured at least one request");

if expect_jsonl_traces {
let has_jsonl = std::fs::read_dir(run_dir.join("traces"))
.unwrap()
.filter_map(|e| e.ok())
.any(|e| e.path().extension().is_some_and(|x| x == "jsonl"));
assert!(has_jsonl, "{agent} should have streamed at least one .jsonl trace");
}
}

#[test]
#[ignore = "live: needs a model server + podman"]
fn live_pi() {
    // pi streams its native session JSONL out through the in-container symlink,
    // so this run must also leave a `.jsonl` trace behind.
    let expect_jsonl_traces = true;
    run_agent("pi", expect_jsonl_traces);
}

#[test]
#[ignore = "live: needs a model server + podman"]
fn live_goose() {
    // No `.jsonl` trace is asserted for goose; only the wire path, run.json and
    // captures are checked.
    let expect_jsonl_traces = false;
    run_agent("goose", expect_jsonl_traces);
}

// hermes and opencode are intentionally omitted — neither runs via `agentcap run`
// on the tiny CI model:
// - hermes: its base system prompt (~3.9k tokens) exceeds the budget on
// Qwen3-1.7B, so it bails before any model call. The Python suite never ran
// hermes through the CLI either — `test_hermes_live` drove the driver directly
// with prompt-shrinking flags (`ignore_rules`, `toolsets="file"`) that `run`
// doesn't expose. hermes stdout parsing is covered by unit tests.
// - opencode: 1.15.x doesn't pick up the baked `agent.minimal` from the image
// (matching the `@pytest.mark.skip` on `test_opencode_live`).
// pi (symlink/JSONL traces) + goose (dump-traces/SQLite) cover the full stack
// across both trace-surfacing mechanisms.
Loading