From 990f9d009ff4613a3e55ef24aae6f719a9f6386b Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 25 Jun 2026 14:35:51 +0000 Subject: [PATCH 1/2] feat(run,export): add --label to tag a run and export it as a column `run --label ` records a free-form label in run.json; export emits it as a constant `label` column (read from run.json the same way as `agent`). Lets a comparison sweep tag each run with its condition (e.g. a bare/clone/skill tier) so cells stay identifiable in `agentcap ls`, inspect, and the published dataset. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/export.rs | 11 +++++++++++ src/main.rs | 10 +++++++++- src/run.rs | 9 ++++++++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/export.rs b/src/export.rs index a31f509..2cfb1b6 100644 --- a/src/export.rs +++ b/src/export.rs @@ -337,6 +337,12 @@ fn agent_from_run_json(run_dir: &Path) -> Option { rec.get("agent").and_then(Value::as_str).map(str::to_string) } +/// Optional free-form `label` recorded by `run --label` (e.g. a sweep tier). 
+fn label_from_run_json(run_dir: &Path) -> Option { + let rec = read_json(&run_dir.join("run.json")).ok()?; + rec.get("label").and_then(Value::as_str).map(str::to_string) +} + fn push_captures_dataset( client: &HFClientSync, items: &[CapItem], @@ -370,6 +376,11 @@ fn push_captures_dataset( for (i, item) in items.iter().enumerate() { let mut extra = detect_provider_columns(&item.capture_dir); let provider = extra.iter().find(|(k, _)| k == "provider").map(|(_, v)| v.clone()); + if let Some(run_dir) = item.capture_dir.parent() { + if let Some(label) = label_from_run_json(run_dir) { + extra.push(("label".to_string(), label)); + } + } extra.push(("run_id".to_string(), item.run_id.clone())); let filename = default_filename(item.agent.as_deref(), Some(&item.model), provider.as_deref()); let local = tmp.path().join(format!("{i}-{filename}")); diff --git a/src/main.rs b/src/main.rs index 5d750b2..535eb98 100644 --- a/src/main.rs +++ b/src/main.rs @@ -13,6 +13,9 @@ struct Cli { } #[derive(Subcommand)] +// `Run` carries the full run config (many args); this enum is parsed once at +// startup, not stored in bulk, so the variant-size spread doesn't matter. +#[allow(clippy::large_enum_variant)] enum Cmd { /// Drive an agent through a corpus, capturing every chat-completion. Run { @@ -38,6 +41,10 @@ enum Cmd { /// bind-mounted read-only and its `bin/` prepended to the agent's PATH. #[arg(long)] tool_dir: Option, + /// Free-form label recorded in run.json and exported as a column + /// (e.g. a tier/condition tag for a comparison sweep). + #[arg(long)] + label: Option, /// Plain-text file: one prompt per line (# comments + blanks ignored). 
#[arg(long)] tasks: String, @@ -92,12 +99,13 @@ fn main() -> Result<()> { sandbox, skills, tool_dir, + label, tasks, turns, followup, timeout, } => agentcap::run::run( - agent, model, upstream, api_key, sandbox, skills, tool_dir, tasks, turns, followup, timeout, + agent, model, upstream, api_key, sandbox, skills, tool_dir, label, tasks, turns, followup, timeout, ), Cmd::Ls { workspace, long } => agentcap::ls::run(workspace, long), Cmd::Export { diff --git a/src/run.rs b/src/run.rs index 396f684..7e40b72 100644 --- a/src/run.rs +++ b/src/run.rs @@ -26,6 +26,7 @@ pub fn run( sandbox_dir: Option, skills_dir: Option, tool_dir: Option, + label: Option, tasks_file: String, turns: i64, followup: String, @@ -68,6 +69,7 @@ pub fn run( &upstream, turns, &followup, + label.as_deref(), &[], )?; @@ -155,6 +157,7 @@ pub fn run( &upstream, turns, &followup, + label.as_deref(), &results, )?; let n_ok = results.iter().filter(|r| r.completed_turns() as i64 == turns).count(); @@ -227,6 +230,7 @@ fn write_run_json( upstream: &str, turns: i64, followup: &str, + label: Option<&str>, results: &[TaskResult], ) -> Result<()> { let tasks: Vec<_> = results @@ -245,10 +249,13 @@ fn write_run_json( }) }) .collect(); - let summary = json!({ + let mut summary = json!({ "agent": agent, "model": model, "provider": provider, "upstream": upstream, "turns_per_task": turns, "followup": followup, "tasks": tasks, }); + if let Some(l) = label { + summary["label"] = json!(l); + } std::fs::write(workdir.join("run.json"), serde_json::to_string_pretty(&summary)?)?; Ok(()) } From 94ec4a846aeca9b8fed295d6dfc0057847eb09a2 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 25 Jun 2026 10:41:12 +0000 Subject: [PATCH 2/2] feat(examples): add transformers-agentic corpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port of huggingface/is-it-agentic-enough: 16 prompts that each ask an agent to run a named Hugging Face model and report the result (sentiment, NER, 
QA, transcription, captioning, …), across the bare/clone/skill assistance tiers. build-toolenv.sh builds a self-contained, relocatable transformers bundle inside ubuntu:24.04 (the agent-image base) — pinned to the unreleased agentic-CLI commit, CPU torch — and prewarms every corpus model into a shared cache read offline at run time. run.sh mounts it via --tool-dir and seeds the per-tier sandbox. Used to compare models/agents through agentcap's capture path; match/marker scoring stays with the upstream harness. Co-Authored-By: Claude Opus 4.8 (1M context) --- examples/transformers-agentic/.gitignore | 9 ++ examples/transformers-agentic/README.md | 69 ++++++++++ .../transformers-agentic/build-toolenv.sh | 121 ++++++++++++++++++ examples/transformers-agentic/run.sh | 95 ++++++++++++++ .../skill/agents/AGENTS.md | 17 +++ .../skill/skills/transformers/SKILL.md | 113 ++++++++++++++++ examples/transformers-agentic/tasks.txt | 25 ++++ 7 files changed, 449 insertions(+) create mode 100644 examples/transformers-agentic/.gitignore create mode 100644 examples/transformers-agentic/README.md create mode 100755 examples/transformers-agentic/build-toolenv.sh create mode 100755 examples/transformers-agentic/run.sh create mode 100644 examples/transformers-agentic/skill/agents/AGENTS.md create mode 100644 examples/transformers-agentic/skill/skills/transformers/SKILL.md create mode 100644 examples/transformers-agentic/tasks.txt diff --git a/examples/transformers-agentic/.gitignore b/examples/transformers-agentic/.gitignore new file mode 100644 index 0000000..54a4dca --- /dev/null +++ b/examples/transformers-agentic/.gitignore @@ -0,0 +1,9 @@ +# Build artefacts and run outputs — never commit (the bundle alone is ~16 GB). +# Anchored to this dir so they don't match like-named subdirs (e.g. the skill's +# own transformers/ folder). 
+/toolenv/ +/transformers/ +/inputs/ +/sandbox-*/ +/xet-test/ +/.agentcap/ diff --git a/examples/transformers-agentic/README.md b/examples/transformers-agentic/README.md new file mode 100644 index 0000000..d908bef --- /dev/null +++ b/examples/transformers-agentic/README.md @@ -0,0 +1,69 @@ +# transformers-agentic + +agentcap port of the [`is-it-agentic-enough`](https://github.com/huggingface/is-it-agentic-enough) +task suite (the [blog post](https://huggingface.co/blog/is-it-agentic-enough)): +16 prompts that each ask an agent to run a **named** Hugging Face model +(classify sentiment, transcribe audio, caption an image, …) and report the +result. Because each task pins a specific model, the agent has to actually +load and run it rather than answer from world knowledge. + +Here it's used to **compare models/agents** through agentcap's capture path — +not to reproduce the article's scoring. agentcap records the agent ↔ model +wire traffic; match %, token, and CLI-vs-`pipeline()` marker analysis are the +upstream harness's job (the captures contain what's needed to compute them +later). + +## How the agent actually runs transformers + +The agent's task work executes **inside the podman sandbox**, which ships only +the agent CLI — no transformers. Rather than rebuild the images, a +self-contained, relocatable `transformers` bundle is mounted read-only via +`agentcap run --tool-dir` and put on the agent's PATH: + +```bash +./build-toolenv.sh # one-time: builds ./toolenv/ + prewarms the model cache +``` + +`build-toolenv.sh` builds the bundle **inside `ubuntu:24.04`** — the base of +every agentcap agent image — so the venv's interpreter and torch `.so`s are +ABI-identical when mounted into any sandbox. It pins the exact transformers +commit that carries the (still unreleased) agentic CLI, installs CPU torch, and +prewarms every corpus model into `./toolenv/hf-cache/`. 
The venv configures +itself to use that cache — a `.pth` points `HF_HOME` at it (resolved from the +venv root, so it holds wherever the bundle is mounted) and defaults to offline — +so runs read models from the read-only mount with no network or re-downloads. + +## Tiers (the article's discovery conditions) + +| `--tier` | what the agent gets | +|---|---| +| `bare` | empty cwd; only the mounted `transformers` bundle | +| `clone` | cwd is a git worktree of `./transformers` @ the bundle's commit (AGENTS.md / `cli/agentic/*.py` auto-discover) | +| `skill` | empty cwd + the packaged transformers Skill (`./skill`) in context | + +## Run + +```bash +# server: any OpenAI-compat /v1 on $UPSTREAM (default http://127.0.0.1:8001) +./run.sh --agent pi --model unsloth/GLM-4.5-Air-GGUF --tier skill +./run.sh --agent hermes --model unsloth/GLM-4.5-Air-GGUF --tier bare +``` + +`./run.sh --help` for the env knobs. It pins `AGENTCAP_WORKSPACE` here, so runs +live under `./.agentcap/` — list them with `agentcap ls` from this directory, and +publish with `agentcap export --push <user>/<dataset>`. + +`tasks.txt` is the full 16-task corpus; pass `--tasks <file>` to run a subset. + +## Caveats vs. the article + +- **One cwd per (agent, model, tier) run**, reused across the corpus's tasks + (agentcap runs a corpus in a single sandbox), where the article isolates each + task in its own worktree. File writes from one task can persist into the next. +- The agentic CLI is unreleased; the bundle pins commit + `4d15b215f3` (`is-it-agentic-enough`'s "w/ CLI + Skill" ref). +- Prewarm uses the classic HTTPS backend (`HF_HUB_DISABLE_XET=1`): xet stalled + once on a transient CAS hiccup during a long bulk download, and HTTPS is + steadier for a one-shot prewarm. (xet into the bind-mounted cache itself works + fine — verified; it's not a mount problem.) Runs read the cache offline, so + xet is never invoked at run time regardless. 
diff --git a/examples/transformers-agentic/build-toolenv.sh b/examples/transformers-agentic/build-toolenv.sh new file mode 100755 index 0000000..23fd392 --- /dev/null +++ b/examples/transformers-agentic/build-toolenv.sh @@ -0,0 +1,121 @@ +#!/usr/bin/env bash +# One-time builder for the self-contained `transformers` bundle the corpus +# mounts via `agentcap run --tool-dir`. Everything (interpreter, torch, the +# agentic-CLI transformers, and a prewarmed model cache) lives under ./toolenv, +# built INSIDE ubuntu:24.04 — the exact base of every agentcap agent image — so +# the venv's /usr/bin/python3.12 base and torch .so's are ABI-identical when the +# bundle is mounted (read-only) into any agent sandbox. +# +# ./toolenv/ relocatable venv (bin/transformers, bin/python, lib/) +# ./toolenv/hf-cache/ prewarmed HF cache; the venv points HF_HOME here +# ./transformers/ transformers checkout @ PINNED_SHA (clone-tier source) +# ./inputs/ corpus inputs (cat.jpg, sample.wav), fetched from the blog repo +# +# Re-run to prewarm any missing models; the heavy build is skipped if ./toolenv +# already has a working `transformers`. Pass HF_TOKEN for faster, rate-limit-free +# downloads. The agentic CLI is unreleased, so we pin the exact commit. + +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)" + +# transformers @ the "agent-first CLI" effort (is-it-agentic-enough's +# `4d15b215f3` / "w/ CLI + Skill"). Not on main, not in any release. +PINNED_SHA="4d15b215f37bcb25a2d6472b2147e34b3d465186" + +# Every model named in the corpus (tasks.txt). Prewarmed so runs read offline. 
+MODELS=" +distilbert/distilbert-base-uncased-finetuned-sst-2-english +dslim/bert-base-NER +openai/whisper-tiny +llava-hf/llava-interleave-qwen-0.5b-hf +HuggingFaceTB/SmolLM2-360M-Instruct +facebook/bart-large-cnn +distilbert/distilbert-base-cased-distilled-squad +distilbert/distilbert-base-uncased +facebook/bart-large-mnli +google/vit-base-patch16-224 +facebook/detr-resnet-50 +laion/clap-htsat-unfused +Helsinki-NLP/opus-mt-en-fr +" + +# Corpus inputs (the cat image + audio clip the tasks reference) live in the +# is-it-agentic-enough repo; fetch them at a pinned commit instead of vendoring +# binaries here. Idempotent — skipped if already present. +INPUTS_SHA="1655d61abf056c58ee2bc8682cb2f0d336ce31ae" +INPUTS_URL="https://raw.githubusercontent.com/huggingface/is-it-agentic-enough/${INPUTS_SHA}/src/ae/data/inputs" +mkdir -p "$HERE/inputs" +for f in cat.jpg sample.wav; do + [ -f "$HERE/inputs/$f" ] || { echo ">>> fetching input $f"; curl -fsSL "$INPUTS_URL/$f" -o "$HERE/inputs/$f"; } +done + +podman run --rm -i \ + -e PINNED_SHA="$PINNED_SHA" -e MODELS="$MODELS" \ + -e HF_TOKEN="${HF_TOKEN:-}" \ + -e HF_HUB_DISABLE_XET=1 \ + -v "$HERE:$HERE" -w "$HERE" \ + ubuntu:24.04 bash -s <<'IN_CONTAINER' +set -e +export DEBIAN_FRONTEND=noninteractive +apt-get update -q >/dev/null +apt-get install -y -q --no-install-recommends python3 python3-venv python3-pip git ca-certificates >/dev/null +HERE="$(pwd)"; TE="$HERE/toolenv"; TFSRC="$HERE/transformers" +# Prewarm into the bundle's cache, explicitly online (the .pth written below +# makes the venv default to offline at run time; setdefault leaves these be). +export HF_HOME="$TE/hf-cache" HF_HUB_OFFLINE=0 TRANSFORMERS_OFFLINE=0 + +if [ ! 
-x "$TE/bin/transformers" ]; then + echo ">>> fetching transformers @ $PINNED_SHA" + rm -rf "$TFSRC"; mkdir -p "$TFSRC"; cd "$TFSRC" + git init -q; git remote add origin https://github.com/huggingface/transformers + git fetch -q --depth 1 origin "$PINNED_SHA"; git checkout -q FETCH_HEAD + cd "$HERE" + echo ">>> building venv + CPU torch + transformers + task deps" + python3 -m venv "$TE" + "$TE/bin/pip" install -q --no-cache-dir --upgrade pip + "$TE/bin/pip" install -q --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu + "$TE/bin/pip" install -q --no-cache-dir "$TFSRC" + "$TE/bin/pip" install -q --no-cache-dir timm pillow sentencepiece sacremoses librosa soundfile scipy accelerate protobuf openai +else + echo ">>> toolenv present; skipping build" +fi + +# Self-configuring bundle: a .pth points HF_HOME at the in-bundle hf-cache, +# resolved from the venv root (sys.prefix) so it holds wherever the bundle is +# mounted read-only. The agent invokes the bundle's python/transformers, so HF +# reads the prewarmed cache offline with no per-sandbox env setup. (Ubuntu venvs +# don't auto-import sitecustomize, hence the .pth + helper module.) 
+SP="$("$TE/bin/python" -c 'import sysconfig; print(sysconfig.get_path("purelib"))')" +cat > "$SP/_agentcap_hf_home.py" <<'PY' +import os +import sys + +_cache = os.path.join(sys.prefix, "hf-cache") +if os.path.isdir(_cache): + os.environ.setdefault("HF_HOME", _cache) + os.environ.setdefault("HF_HUB_OFFLINE", "1") + os.environ.setdefault("TRANSFORMERS_OFFLINE", "1") +PY +echo 'import _agentcap_hf_home' > "$SP/_agentcap_hf_home.pth" + +echo ">>> sanity: CLI + pipeline import" +"$TE/bin/transformers" --help >/dev/null && echo " transformers CLI OK" +"$TE/bin/python" -c "from transformers import pipeline" && echo " pipeline import OK" + +echo ">>> prewarming model cache (xet disabled)" +for m in $MODELS; do + printf ' %-58s ' "$m" + if "$TE/bin/python" - "$m" <<'PY' 2>/tmp/dl.err +import sys +from huggingface_hub import snapshot_download +# PyTorch + safetensors only; skip the TF/Flax/ONNX/Rust/GGUF weight copies +# transformers never loads (they triple the download for no benefit). +snapshot_download(sys.argv[1], ignore_patterns=[ + "*.h5", "tf_model*", "*.msgpack", "flax_model*", "*.onnx", "onnx/**", + "*.tflite", "rust_model.ot", "*.gguf", +]) +PY + then echo "ok"; else echo "FAILED"; tail -2 /tmp/dl.err; fi +done +echo ">>> DONE. bundle at $TE ($(du -sh "$TE" | cut -f1))" +IN_CONTAINER diff --git a/examples/transformers-agentic/run.sh b/examples/transformers-agentic/run.sh new file mode 100755 index 0000000..0857179 --- /dev/null +++ b/examples/transformers-agentic/run.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# Drive the transformers-agentic corpus (the `is-it-agentic-enough` task suite) +# through any registered agent, in one of three assistance tiers. Each task +# names a specific HF model the agent must actually load and run, so the agent +# needs a runnable `transformers` — provided by a self-contained, relocatable +# bundle mounted via `agentcap run --tool-dir` (build it once with +# ./build-toolenv.sh). The agent's own model is served on $UPSTREAM as usual. 
+# +# Tiers (the article's bare/clone/skill discovery conditions): +# bare empty cwd; only the mounted transformers bundle is available. +# clone cwd is a detached git worktree of ./transformers @ the bundle's +# commit, so AGENTS.md / cli/agentic/*.py auto-discover from cwd. +# skill empty cwd + the packaged transformers Skill (./skill) in context. +# +# Usage: +# ./run.sh --agent <agent> --model <model> [--tier bare|clone|skill] [--tasks <file>] +# +# Examples: +# ./run.sh --agent pi --model unsloth/GLM-4.5-Air-GGUF --tier skill +# ./run.sh --agent hermes --model unsloth/GLM-4.5-Air-GGUF --tier bare +# +# Captures land under $HERE/.agentcap/<run_id>/; publish with `agentcap export`. +# +# Env knobs: +# UPSTREAM model server URL http://127.0.0.1:8001 +# TURNS turns per task 1 +# FOLLOWUP continue | templates | synthesized continue +# TIMEOUT per-turn timeout (seconds) 900 + +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)" +export AGENTCAP_WORKSPACE="$HERE" + +AGENT="" MODEL="" TIER="bare" TASKS="$HERE/tasks.txt" +while [[ $# -gt 0 ]]; do + case "$1" in + --agent) AGENT="$2"; shift 2 ;; + --model) MODEL="$2"; shift 2 ;; + --tier) TIER="$2"; shift 2 ;; + --tasks) TASKS="$2"; shift 2 ;; + -h|--help) sed -n '/^# Usage:/,/^set -euo/p' "$0" | sed 's/^# \?//; /^set -euo/d'; exit 0 ;; + *) echo "ERROR: unexpected arg: $1" >&2; exit 2 ;; + esac +done +[[ -n "$AGENT" && -n "$MODEL" ]] || { echo "ERROR: --agent and --model are required. See: $0 --help" >&2; exit 2; } +[[ "$TIER" =~ ^(bare|clone|skill)$ ]] || { echo "ERROR: --tier must be bare|clone|skill" >&2; exit 2; } +[[ -f "$TASKS" ]] || { echo "ERROR: tasks file not found: $TASKS" >&2; exit 2; } + +UPSTREAM="${UPSTREAM:-http://127.0.0.1:8001}" +TURNS="${TURNS:-1}" +FOLLOWUP="${FOLLOWUP:-continue}" +TIMEOUT="${TIMEOUT:-900}" + +TOOLENV="$HERE/toolenv" +[[ -x "$TOOLENV/bin/transformers" ]] || { + echo "ERROR: transformers bundle missing at $TOOLENV. 
Build it first:" >&2 + echo " ./build-toolenv.sh" >&2 + exit 2 +} + +# Per-tier sandbox cwd, rebuilt fresh each invocation, with inputs/ seeded. +SANDBOX="$HERE/sandbox-$TIER" +if [[ -e "$SANDBOX/.git" ]]; then + git -C "$HERE/transformers" worktree remove --force "$SANDBOX" 2>/dev/null || true +fi +rm -rf "$SANDBOX" +if [[ "$TIER" == "clone" ]]; then + [[ -d "$HERE/transformers/.git" ]] || { echo "ERROR: clone tier needs $HERE/transformers (built by ./build-toolenv.sh)" >&2; exit 2; } + SHA="$(git -C "$HERE/transformers" rev-parse HEAD)" + git -C "$HERE/transformers" worktree add --detach "$SANDBOX" "$SHA" >/dev/null +else + mkdir -p "$SANDBOX" +fi +cp -r "$HERE/inputs" "$SANDBOX/inputs" + +# Only the skill tier passes --skills; empty otherwise. Expanded set-u-safe below +# (bash 3.2 treats "${arr[@]}" of an empty array as an unbound-variable error). +skill_args=() +[[ "$TIER" == "skill" ]] && skill_args=(--skills "$HERE/skill") + +echo ">>> agent=$AGENT model=$MODEL tier=$TIER tasks=$(basename "$TASKS") upstream=$UPSTREAM" >&2 +agentcap run \ + --agent "$AGENT" \ + --model "$MODEL" \ + --upstream "$UPSTREAM" \ + --sandbox "$SANDBOX" \ + --tool-dir "$TOOLENV" \ + --label "$TIER" \ + "${skill_args[@]+"${skill_args[@]}"}" \ + --tasks "$TASKS" \ + --turns "$TURNS" \ + --followup "$FOLLOWUP" \ + --timeout "$TIMEOUT" + +echo "done. captures under $HERE/.agentcap/ (agentcap ls). publish: agentcap export" >&2 diff --git a/examples/transformers-agentic/skill/agents/AGENTS.md b/examples/transformers-agentic/skill/agents/AGENTS.md new file mode 100644 index 0000000..c5de1e5 --- /dev/null +++ b/examples/transformers-agentic/skill/agents/AGENTS.md @@ -0,0 +1,17 @@ + + +You have additional SKILLs documented in directories containing a "SKILL.md" file. + +These skills are: + - transformers -> "skills/transformers/SKILL.md" + +IMPORTANT: You MUST read the SKILL.md file whenever the description of the skills matches the user intent, or may help accomplish their task. 
+ + +transformers: `Run one-off Hugging Face Transformers inference from the command line — classify, ner, qa, fill-mask, summarize, translate, tokenize, caption, image-classify, detect, vqa, transcribe, audio-classify, generate, and more. Use this skill whenever a task asks you to run a named model on text, an image, or audio: invoke the `transformers` CLI (e.g. `transformers --format json classify --text "..." --model ...`) rather than hand-writing a Python `pipeline(...)` script. Run `transformers --help` for the full command list.` + + +Paths referenced within SKILL folders are relative to that SKILL. + + diff --git a/examples/transformers-agentic/skill/skills/transformers/SKILL.md b/examples/transformers-agentic/skill/skills/transformers/SKILL.md new file mode 100644 index 0000000..c748397 --- /dev/null +++ b/examples/transformers-agentic/skill/skills/transformers/SKILL.md @@ -0,0 +1,113 @@ +--- +name: transformers +description: Agent-invokable Transformers commands. Pass `--format json` at the top level (e.g. `transformers --format json classify ...`) to receive the structured output documented in each capability's `outputs` schema. +--- + +# Transformers CLI + +For one-off inference, training, quantization, or export, invoke the +`transformers` command directly rather than writing Python. Run +`transformers --help` for the full command list; run +`transformers <command> --help` for flags per command. + +## Invocation rules + +**All inputs are named flags, never positional.** Wrong invocations like +``transformers classify "my text"`` or ``transformers ner "sentence"`` will +fail with ``Got unexpected extra argument``. The text / image / audio / file +argument is always a flag: ``--text``, ``--image``, ``--audio``, ``--file``. + +**Always invoke as `transformers ...`.** Do not use +``python -m transformers ...`` patterns — the console script is what the +``transformers`` package installs. 
+ +**Use `transformers --format json` for machine-readable output**: +``transformers --format json classify --text "..."``. + +## Example invocations (copy these shapes) + +Text (classify, ner, token-classify, summarize, translate, fill-mask): +``` +transformers classify --text "I loved this movie" +transformers classify --text "..." --model distilbert/distilbert-base-uncased-finetuned-sst-2-english +transformers ner --text "Apple CEO Tim Cook visited Paris." --model dslim/bert-base-NER +transformers summarize --file article.txt --model facebook/bart-large-cnn +transformers translate --text "The weather is nice" --model Helsinki-NLP/opus-mt-en-de +``` + +Question answering (takes `--question` and `--context`): +``` +transformers qa --question "Who invented it?" --context "Graham Bell invented the telephone in 1876." +``` + +Image (caption, image-classify, detect, segment, depth, vqa, ocr): +``` +transformers caption --image photo.jpg --model llava-hf/llava-interleave-qwen-0.5b-hf +transformers image-classify --image photo.jpg +transformers vqa --image photo.jpg --question "What color is the car?" +``` + +Audio (transcribe, audio-classify, speak): +``` +transformers transcribe --audio clip.wav --model openai/whisper-tiny +transformers audio-classify --audio clip.wav +transformers speak --text "Hello" --output hello.wav +``` + +Tokenize / inspect / embed: +``` +transformers tokenize --model HuggingFaceTB/SmolLM2-360M-Instruct --text "tokenize me" +transformers inspect meta-llama/Llama-3.2-1B-Instruct +transformers embed --text "some sentence" --model BAAI/bge-small-en-v1.5 +``` + +Generate (text completion): +``` +transformers generate --prompt "Once upon a time" --model HuggingFaceTB/SmolLM2-360M-Instruct +``` + +## Available commands + +- `transformers classify` — Classify text into categories +- `transformers ner` — Extract named entities from text (NER) +- `transformers token-classify` — Tag tokens with labels (POS tagging, chunking, etc.) 
+- `transformers qa` — Answer a question given a context paragraph (extractive QA) +- `transformers table-qa` — Answer a question about tabular data (CSV) +- `transformers summarize` — Summarize text +- `transformers translate` — Translate text between languages +- `transformers fill-mask` — Predict the masked token in a sentence +- `transformers image-classify` — Classify an image +- `transformers detect` — Detect objects in an image +- `transformers segment` — Segment an image +- `transformers depth` — Estimate a depth map from an image +- `transformers keypoints` — Match keypoints between two images +- `transformers video-classify` — Classify a video +- `transformers transcribe` — Transcribe speech from an audio file +- `transformers audio-classify` — Classify an audio file into categories +- `transformers speak` — Synthesize speech from text and save to a WAV file +- `transformers audio-generate` — Generate audio (e.g. music) from a text description and save to a WAV file +- `transformers vqa` — Visual question answering using ``AutoModelForImageTextToText`` +- `transformers document-qa` — Extractive document question answering using +- `transformers caption` — Generate a caption for an image using ``AutoModelForImageTextToText`` +- `transformers ocr` — Extract text from an image using ``AutoModelForImageTextToText`` +- `transformers multimodal-chat` — Single-turn conversation with a model that accepts mixed inputs +- `transformers generate` — Generate text from a prompt with full control over decoding +- `transformers detect-watermark` — Detect whether text contains a watermark +- `transformers embed` — Compute embeddings for text or images +- `transformers tokenize` — Tokenize text and display the resulting tokens +- `transformers inspect` — Inspect a model's configuration without downloading weights +- `transformers inspect-forward` — Examine attention weights and hidden states from a forward pass +- `transformers benchmark-quantization` — Compare quality 
and performance across quantization methods +- `transformers train` — Fine-tune or pretrain a model on a dataset +- `transformers quantize` — Quantize a model and save it +- `transformers export` — Export a model to a deployment-friendly format + +## When to use what + +- **Atomic task** (single inference / training / export): use the CLI. +- **Composed workflow** (chain models, custom logic): write Python. + The CLI commands' source in `transformers.cli.agentic.*` is the + canonical template — each file loads a model with `AutoModel*` + + `AutoProcessor`/`AutoTokenizer`, runs a forward pass, and + post-processes. Copy that pattern rather than reaching for + `pipeline(...)`. diff --git a/examples/transformers-agentic/tasks.txt b/examples/transformers-agentic/tasks.txt new file mode 100644 index 0000000..bd90034 --- /dev/null +++ b/examples/transformers-agentic/tasks.txt @@ -0,0 +1,25 @@ +# transformers-agentic corpus — 16 prompts from huggingface/is-it-agentic-enough +# (src/ae/data/transformers.yaml), one prompt per line as agentcap requires. +# Each task names a specific HF model so the agent must actually load and run +# it (otherwise it answers from world knowledge and measures nothing). Order +# matches the source: task_01..task_16. Inputs (cat.jpg, sample.wav) are seeded +# into the agent's cwd under ./inputs/ by run.sh. +# +# atomic (14): one agentic-CLI command covers it in the "after" state. +Using the model distilbert/distilbert-base-uncased-finetuned-sst-2-english, classify the sentiment of this sentence and report the label and score: "I absolutely loved the movie, it was fantastic!" +Using the model dslim/bert-base-NER, extract the named entities from this sentence and report them with their types: "Apple CEO Tim Cook visited Paris last week to meet President Macron." +Using the model openai/whisper-tiny, transcribe the audio file at ./inputs/sample.wav and report the transcript. 
+Using the model llava-hf/llava-interleave-qwen-0.5b-hf, caption the image at ./inputs/cat.jpg and report the caption. +Using the tokenizer from HuggingFaceTB/SmolLM2-360M-Instruct, tokenize the sentence "The quick brown fox jumps over the lazy dog." and tell me how many tokens it is. +Using the model facebook/bart-large-cnn, summarize this article in one or two sentences: The James Webb Space Telescope has captured unprecedented images of distant galaxies, revealing structures that formed less than a billion years after the Big Bang. Scientists say the observations, released today, challenge existing models of early galaxy formation and suggest that massive galaxies assembled far more quickly than previously thought. The data will be made publicly available next month. +Using distilbert/distilbert-base-cased-distilled-squad, answer this question from the given context and report the answer. Question: "Who invented the telephone?" Context: "Alexander Graham Bell invented the telephone in 1876." +Using distilbert/distilbert-base-uncased, fill in the masked token in this sentence and report the top prediction: "The capital of France is [MASK]." +Using facebook/bart-large-mnli, classify this sentence into exactly one of the labels technology, sports, or cooking, and report the chosen label: "The new GPU massively speeds up deep-learning training." +Using google/vit-base-patch16-224, classify the image at ./inputs/cat.jpg and report the top predicted label. +Using facebook/detr-resnet-50, detect the objects in the image at ./inputs/cat.jpg and report the detected labels. +Using llava-hf/llava-interleave-qwen-0.5b-hf, answer this question about the image at ./inputs/cat.jpg: "What animal is in the image?" 
+Using HuggingFaceTB/SmolLM2-360M-Instruct, generate a short (one or two sentence) continuation of this prompt and report the generated text: "Once upon a time, in a small village by the sea," +Using the zero-shot audio classifier laion/clap-htsat-unfused, classify the audio file at ./inputs/sample.wav into one of speech, music, or noise, and report the chosen label. +# compositional (2): no single CLI command fits; the agent must write Python. +Transcribe the audio file at ./inputs/sample.wav using openai/whisper-tiny, then classify the sentiment of the transcript using distilbert/distilbert-base-uncased-finetuned-sst-2-english. Report both the transcript and the sentiment. +Caption the image at ./inputs/cat.jpg using llava-hf/llava-interleave-qwen-0.5b-hf, then translate the caption to French using Helsinki-NLP/opus-mt-en-fr. Report both the English caption and the French translation.