Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
36cd0b2
feat(multiturn): add kimi agentic benchmark controls
hvagadia Jun 3, 2026
3425267
feat: add multi-turn benchmark controls
hvagadia Jun 3, 2026
0a13672
fix: skip disabled multi-turn inline accuracy
hvagadia Jun 3, 2026
6c7ad2d
feat(bfcl): add BFCL v4 single-turn + multi-turn accuracy integration
Palanivelg May 29, 2026
ec4832c
feat(bfcl): per-category sampling + tiny-subset floor + Thor example
Palanivelg May 29, 2026
5bef6d7
docs(bfcl): generalize "Thor" to "edge device" in BFCL v4 example
Palanivelg May 29, 2026
b547ae1
fix(bfcl): guard rt_settings access in accuracy-only mode
Palanivelg May 29, 2026
4507af9
fix(bfcl): per-phase drain timeout; accuracy phases drain unbounded
Palanivelg May 29, 2026
ca02361
fix(bfcl): stop duplicating tool_calls in TextModelOutput.output
Palanivelg May 29, 2026
41e0132
fix(bfcl): score against structured tool_calls to tolerate prose prea…
Palanivelg May 29, 2026
a2c4849
fix(bfcl): send tool_choice="auto" for single-turn function calling
Palanivelg May 29, 2026
bed0b6f
feat(bfcl): forward seed parameter through adapters and MT runner
Palanivelg Jun 5, 2026
cd70253
docs(bfcl): add reproduction guide with Thor validation results
Palanivelg Jun 5, 2026
90948b7
docs(bfcl): rewrite README as a beginner-friendly step-by-step guide
Palanivelg Jun 5, 2026
7ef3495
docs(bfcl): update README with corrected Thor validation data
Palanivelg Jun 5, 2026
ab8b591
fix(bfcl): add soundfile dependency for qwen_agent transitive import
Palanivelg Jun 5, 2026
37c74d2
refactor(bfcl): rename example folder to 10_Edge_Agentic_Example; add…
Palanivelg Jun 8, 2026
723e96a
fix(bfcl): address code review issues in execution, runner, and scorer
Palanivelg Jun 8, 2026
d628805
docs(bfcl): annotate YAML with do-not-change submission rules
Palanivelg Jun 8, 2026
4f2a21d
fix(bfcl): resolve CI failures — numpy pin, templates, mypy, audit
Palanivelg Jun 8, 2026
c9cb448
fix(bfcl): honor configured accuracy_timeout_s for accuracy phase drain
Palanivelg Jun 8, 2026
daae8b7
fix(bfcl): run_accuracy.sh — render temp config instead of invalid flags
Palanivelg Jun 8, 2026
d6c10c8
Merge branch 'pr-331' into feat/edge-agentic-perf
Palanivelg Jun 8, 2026
c6a3eee
feat(examples): add edge agentic-coding performance benchmark with in…
Palanivelg Jun 11, 2026
9f6c775
feat(examples): add edge agentic-coding 1h/2.5h perf datasets + configs
Palanivelg Jun 15, 2026
795c369
docs(edge-agentic): finalize single-turn ~995 accuracy + DGX Spark notes
Palanivelg Jun 22, 2026
55ebd61
docs(edge-agentic): add model-acquisition section with canonical source
Palanivelg Jun 22, 2026
676c1d8
chore(edge-agentic): remove orphaned perf configs and datasets
Palanivelg Jun 22, 2026
8b32f4f
Merge origin/main into feat/edge-agentic-perf
Palanivelg Jun 23, 2026
0bc51d0
fix(deps): bump msgpack 1.1.2 -> 1.2.1 to clear pip-audit GHSA-6v7p-g…
Palanivelg Jun 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
416 changes: 416 additions & 0 deletions examples/10_Edge_Agentic_Example/README.md

Large diffs are not rendered by default.

966 changes: 966 additions & 0 deletions examples/10_Edge_Agentic_Example/agentic_coding_1h.jsonl

Large diffs are not rendered by default.

2,014 changes: 2,014 additions & 0 deletions examples/10_Edge_Agentic_Example/agentic_coding_2.5h.jsonl

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# BFCL v4 single-turn accuracy — edge device (~3h) budget
#
# Runs the three single-turn BFCL v4 categories (non_live, live, hallucination)
# through the accuracy pipeline with per-category sampling tuned to draw ~995
# samples — large enough that the point estimate is stable across draws while
# still finishing on an edge device in ~3 hours. This single-turn run is the
# finalized accuracy benchmark.
#
# For final submission, fields marked "do not change" must be kept as-is.
# Fields marked "set to your ..." (served model name, endpoint URL) must be
# updated for your environment.
#
# Run:
# inference-endpoint benchmark from-config \
# --config offline_bfcl_v4_single_turn.yaml --accuracy-only
#
# Requires: pip install -e ".[bfcl]"
# Run identity and overall budget for the single-turn accuracy benchmark.
name: "bfcl-v4-single-turn-accuracy"
version: "1.0" # do not change.
type: "offline" # do not change.
# NOTE(review): presumably seconds (10800 s = 3 h, matching the ~3 h budget
# described in the header) — confirm the unit.
timeout: 10800 # do not change.

model_params:
name: "Qwen3.6-27B-Q4_K_M" # set to your served model name.
temperature: 0 # do not change.

datasets:
- name: bfcl_v4::function_calling # do not change.
type: "accuracy" # do not change.
params:
categories: ["non_live", "live", "hallucination"] # do not change.
# Per-category sampling percentages; together they draw the ~995 samples
# mentioned in the header.
category_sample_pct:
non_live: 62 # do not change.
live: 10 # do not change.
hallucination: 10 # do not change.
# Subsets with total size <= 25 are always taken in full so their scores
# are not reduced to one or two noisy samples.
subset_floor: 25 # do not change.
accuracy_config:
eval_method: "bfcl_v4" # do not change.
ground_truth: "ground_truth" # do not change.
extractor: "function_call_extractor" # do not change.
num_repeats: 1 # do not change.

settings:
runtime:
min_duration_ms: 0 # do not change.
# NOTE(review): presumably fixes which samples the per-category sampling
# draws, so runs are comparable — confirm.
dataloader_random_seed: 42 # do not change.
load_pattern:
type: "max_throughput" # do not change.
client:
num_workers: 1 # do not change.
max_connections: 1 # do not change.

endpoint_config:
endpoints:
- "http://localhost:8080" # set to your endpoint URL.
# NOTE(review): unlike every other field, this one carries neither a
# "do not change" nor a "set to your ..." marker; presumably it only needs a
# value when the endpoint requires authentication — confirm and annotate.
api_key: null

# Relative path; run_accuracy.sh changes into this example directory before
# launching and reports this same location when it finishes.
report_dir: results/bfcl_v4_single_turn_accuracy/ # do not change.
75 changes: 75 additions & 0 deletions examples/10_Edge_Agentic_Example/online_agentic_coding_1h.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Agentic coding performance benchmark — edge device, ~1 h subset (inline accuracy)
#
# Replays recorded multi-turn agentic-coding trajectories (SWE-bench-style) as a
# *performance* workload (throughput / TTFT / TPOT / latency) against the same
# served endpoint used for BFCL v4 accuracy, and runs an inline "online checker"
# that scores the replayed trajectories against the recorded tool calls in the
# dataset (accuracy_config.eval_method: agentic_inference_inline). The dataset is
# therefore both the performance workload and its own ground truth.
#
# Dataset: agentic_coding_1h.jsonl — 9 deep conversations (483 generated turns),
# one per source repository (django, sympy, scikit-learn, sphinx, matplotlib,
# xarray, astropy, pytest, requests), each selected so its peak input sequence
# length stays under the 32K served context. No conversation overflows 32K, so
# every turn completes and the run is *valid* (0 dropped turns). Sized for a
# ~1 h single-stream pass on a ~10-12 tok/s edge box (e.g. Jetson Thor,
# Qwen3.6-27B Q4_K_M, reasoning off). This is the quick "smoke" variant; use
# online_agentic_coding_2.5h.yaml for a larger, statistically solid run.
#
# Reference (NVIDIA Jetson AGX Thor, Q4_K_M + llama.cpp, reasoning off): ~73 min,
# 483/483 turns, 0 dropped, inline IoU 0.6189.
#
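# Run from the repository root — the dataset path below is relative to it
# (start the edge server first; see README.md):
# inference-endpoint benchmark from-config \
# --config examples/10_Edge_Agentic_Example/online_agentic_coding_1h.yaml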
# Run identity for the ~1 h agentic-coding performance benchmark.
name: "agentic-coding-perf-1h"
version: "1.0"
type: "online"

model_params:
name: "Qwen3.6-27B-Q4_K_M" # replace with your served model name
# Deterministic decoding so the inline accuracy check is reproducible. Reasoning
# is disabled server-side (llama-server --reasoning off): on this tool-calling
# workload reasoning gives no inline-IoU benefit and costs ~60% more wall-clock.
temperature: 0
seed: 42
# Output cap per generated turn, in tokens.
max_new_tokens: 1024

datasets:
- name: agentic_coding
type: performance
# Path is written relative to the repository root.
# NOTE(review): confirm how the tool resolves relative dataset paths.
path: examples/10_Edge_Agentic_Example/agentic_coding_1h.jsonl
accuracy_config:
# Inline "online checker": score the replayed performance outputs against
# the recorded tool calls (multiset IoU of executables). The dataset is
# both the performance workload and its own ground truth.
eval_method: agentic_inference_inline
agentic_inference:
# Per-turn deadline; a timeout aborts that turn and the rest of its
# conversation. 10 min is generous for slow edge decode.
turn_timeout_s: 600.0
# One pass over the 9 conversations (no trajectory repeats).
num_trajectories_to_issue: 9

settings:
runtime:
# NOTE(review): presumably "no minimum run length", so the run ends once the
# single pass completes — confirm.
min_duration_ms: 0
# Safety cap (2 h) so the run stays bounded even if decode is slower than
# expected; one pass should finish in ~1 h on an edge box.
max_duration_ms: 7200000
load_pattern:
type: agentic_inference
# Single-stream: one in-flight conversation at a time, matching a single-slot
# edge server (llama.cpp -np 1). Raise only for a multi-slot endpoint.
target_concurrency: 1
client:
# One worker and one connection, consistent with target_concurrency: 1 above.
num_workers: 1
max_connections: 1
# NOTE(review): presumably disables pre-opened warm-up connections — confirm.
warmup_connections: 0

endpoint_config:
endpoints:
- "http://localhost:8080" # replace with your endpoint URL
# NOTE(review): presumably selects the OpenAI-compatible API exposed by the
# served endpoint — confirm.
api_type: openai

# NOTE(review): relative path — presumably resolved against the working
# directory the benchmark is launched from; confirm.
report_dir: results/agentic_coding_perf_1h/
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Agentic coding performance benchmark — edge device, ~2.5 h subset (inline accuracy)
#
# Replays recorded multi-turn agentic-coding trajectories (SWE-bench-style) as a
# *performance* workload (throughput / TTFT / TPOT / latency) against the same
# served endpoint used for BFCL v4 accuracy, and runs an inline "online checker"
# that scores the replayed trajectories against the recorded tool calls in the
# dataset (accuracy_config.eval_method: agentic_inference_inline). The dataset is
# therefore both the performance workload and its own ground truth.
#
# Dataset: agentic_coding_2.5h.jsonl — 20 deep conversations (1007 generated
# turns) spread across 9 source repositories, each selected so its peak input
# sequence length stays under the 32K served context (measured peak ISL ~23.5K,
# leaving headroom for the 1024-token output cap). No conversation overflows
# 32K, so every turn completes and the run is *valid* (0 dropped turns). This is
# the recommended reference performance run: ~2× the sample of the 1 h variant,
# giving a more stable inline-IoU estimate, sized for a ~2.5 h single-stream
# pass on a ~10-12 tok/s edge box (e.g. Jetson Thor, Qwen3.6-27B Q4_K_M,
# reasoning off); with serving optimizations (e.g. MTP speculative decoding)
# this drops toward ~1.5 h.
#
# Reference (NVIDIA Jetson AGX Thor, Q4_K_M + llama.cpp, reasoning off): ~2 h 37 m,
# 1007/1007 turns, 0 dropped, inline IoU 0.6335.
#
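# Run from the repository root — the dataset path below is relative to it
# (start the edge server first; see README.md):
# inference-endpoint benchmark from-config \
# --config examples/10_Edge_Agentic_Example/online_agentic_coding_2.5h.yaml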
# Run identity for the ~2.5 h agentic-coding performance benchmark.
name: "agentic-coding-perf-2.5h"
version: "1.0"
type: "online"

model_params:
name: "Qwen3.6-27B-Q4_K_M" # replace with your served model name
# Deterministic decoding so the inline accuracy check is reproducible. Reasoning
# is disabled server-side (llama-server --reasoning off): on this tool-calling
# workload reasoning gives no inline-IoU benefit and costs ~60% more wall-clock.
temperature: 0
seed: 42
# Output cap per generated turn, in tokens (the header's "1024-token output cap").
max_new_tokens: 1024

datasets:
- name: agentic_coding
type: performance
# Path is written relative to the repository root.
# NOTE(review): confirm how the tool resolves relative dataset paths.
path: examples/10_Edge_Agentic_Example/agentic_coding_2.5h.jsonl
accuracy_config:
# Inline "online checker": score the replayed performance outputs against
# the recorded tool calls (multiset IoU of executables). The dataset is
# both the performance workload and its own ground truth.
eval_method: agentic_inference_inline
agentic_inference:
# Per-turn deadline; a timeout aborts that turn and the rest of its
# conversation. 10 min is generous for slow edge decode.
turn_timeout_s: 600.0
# One pass over the 20 conversations (no trajectory repeats).
num_trajectories_to_issue: 20

settings:
runtime:
# NOTE(review): presumably "no minimum run length", so the run ends once the
# single pass completes — confirm.
min_duration_ms: 0
# Safety cap (4 h) so the run stays bounded even if decode is slower than
# expected; one pass should finish in ~2.5 h on an edge box.
max_duration_ms: 14400000
load_pattern:
type: agentic_inference
# Single-stream: one in-flight conversation at a time, matching a single-slot
# edge server (llama.cpp -np 1). Raise only for a multi-slot endpoint.
target_concurrency: 1
client:
# One worker and one connection, consistent with target_concurrency: 1 above.
num_workers: 1
max_connections: 1
# NOTE(review): presumably disables pre-opened warm-up connections — confirm.
warmup_connections: 0

endpoint_config:
endpoints:
- "http://localhost:8080" # replace with your endpoint URL
# NOTE(review): presumably selects the OpenAI-compatible API exposed by the
# served endpoint — confirm.
api_type: openai

# NOTE(review): relative path — presumably resolved against the working
# directory the benchmark is launched from; confirm.
report_dir: results/agentic_coding_perf_2.5h/
47 changes: 47 additions & 0 deletions examples/10_Edge_Agentic_Example/run_accuracy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Reproduce the BFCL v4 edge-agentic accuracy reference result (~3 h on an edge
# device). The finalized accuracy benchmark is single-turn only, sampled to ~995
# samples (see offline_bfcl_v4_single_turn.yaml).
#
# Usage:
#   1. Point the run at your server, either through the environment
#        MODEL=<served model name> ENDPOINT=<endpoint URL> bash run_accuracy.sh
#      or by editing the MODEL / ENDPOINT defaults below.
#   2. bash run_accuracy.sh
#
# Results are written (relative to this script's directory) to:
#   results/bfcl_v4_single_turn_accuracy/   (single-turn)
#
# Multi-turn is no longer part of the accuracy gate; see README Step 3 for the
# optional exploratory multi-turn run.

set -euo pipefail

MODEL="${MODEL:-Qwen3.6-27B-Q4_K_M}"
ENDPOINT="${ENDPOINT:-http://localhost:8080}"

# Run from the script's own directory so the config name and the relative
# report directory do not depend on where the script was invoked from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Escape the characters that are special in the replacement half of the
# `s|...|...|` commands below: backslash, `&` (the whole match) and the `|`
# delimiter. Without this, an endpoint URL with a query string (`...?a=1&b=2`)
# would be rendered incorrectly.
escape_sed_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

echo "=== BFCL v4 edge-agentic accuracy run ==="
echo " Model: $MODEL"
echo " Endpoint: $ENDPOINT"
echo ""

# from-config reads model name and endpoint from the YAML (it has no
# model/endpoint override flags). Render a temp config with MODEL/ENDPOINT
# substituted in so the env vars above take effect without editing the
# committed YAML; the trailing "# set to your ..." comments anchor the edit.
#
# The temp config lives inside a private temp directory so it can keep a .yaml
# name without `mktemp --suffix`, which is a GNU extension and is not available
# in BSD/macOS mktemp. An explicit template works with both implementations.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/bfcl_accuracy.XXXXXX")"
trap 'rm -rf "$TMP_DIR"' EXIT
ST_CONFIG="$TMP_DIR/offline_bfcl_v4_single_turn.yaml"

MODEL_ESC="$(escape_sed_replacement "$MODEL")"
ENDPOINT_ESC="$(escape_sed_replacement "$ENDPOINT")"

sed -E \
  -e "s|^( *name: ).*(# set to your served model name\.)|\1\"${MODEL_ESC}\" \2|" \
  -e "s|^( *- ).*(# set to your endpoint URL\.)|\1\"${ENDPOINT_ESC}\" \2|" \
  offline_bfcl_v4_single_turn.yaml > "$ST_CONFIG"

# Fail loudly if an anchor comment in the YAML was edited or removed: sed would
# then substitute nothing and the run would silently use the committed values.
if ! grep -qF -- "\"${MODEL}\"" "$ST_CONFIG" \
  || ! grep -qF -- "\"${ENDPOINT}\"" "$ST_CONFIG"; then
  echo "error: could not apply MODEL/ENDPOINT to offline_bfcl_v4_single_turn.yaml;" >&2
  echo "       check that its '# set to your ...' anchor comments are intact." >&2
  exit 1
fi

# Single-turn: non_live (62%), live (10%), hallucination (10%) — ~995 samples, ~3 h
echo "--- Single-turn (~995 samples, ~3 h) ---"
inference-endpoint benchmark from-config \
  --config "$ST_CONFIG" \
  --accuracy-only

echo ""
echo "=== Done. Results in results/bfcl_v4_single_turn_accuracy/ ==="
27 changes: 26 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ environments = [
"sys_platform == 'darwin' and platform_machine == 'x86_64'",
"sys_platform == 'darwin' and platform_machine == 'arm64'",
]
# bfcl-eval hard-pins an old dependency set (numpy==1.26.4, filelock, etc.).
# Mark bfcl as conflicting with the tooling extras so uv resolves it in its own
# fork; otherwise those old pins drag shared dev/test/performance deps (filelock,
# virtualenv) down to versions with known CVEs. CI installs the tooling extras
# without bfcl; bfcl is installed standalone (pip install -e ".[bfcl]").
conflicts = [
[{ extra = "bfcl" }, { extra = "dev" }],
[{ extra = "bfcl" }, { extra = "test" }],
[{ extra = "bfcl" }, { extra = "performance" }],
]

[tool.uv.build-backend]
module-root = "src"
Expand Down Expand Up @@ -59,7 +69,7 @@ dependencies = [
"transformers==5.5.0",
# Required by transformers' apply_chat_template
"jinja2==3.1.6",
"numpy==2.4.4",
"numpy>=1.26.4",
"datasets==4.8.4",
"Pillow==12.2.0",
"sentencepiece==0.2.1",
Expand Down Expand Up @@ -95,6 +105,13 @@ dev = [
"myst-parser==5.0.0",
# Security auditing
"pip-audit==2.10.0",
# bfcl-eval hard-pins filelock==3.20.0, which uv would otherwise share across
# every resolution fork. Because bfcl conflicts with this extra (see
# [tool.uv].conflicts), these floors force uv to fork filelock/virtualenv:
# patched here, pinned only inside the bfcl fork. Closes CVE-2025-68146 /
# CVE-2026-22701 (filelock) and CVE-2026-22702 (virtualenv).
"filelock>=3.20.3",
"virtualenv>=20.36.1",
]
test = [
# Includes optional dependencies for full test coverage
Expand Down Expand Up @@ -122,6 +139,14 @@ performance = [
"pytest-benchmark==5.2.3",
"memory-profiler==0.61.0",
]
bfcl = [
# BFCL v4 function-calling evaluation. Pins numpy==1.26.4, which is why
# the top-level numpy requirement is a lower bound (>=1.26.4).
"bfcl-eval==2026.3.23",
# bfcl-eval's qwen model handler transitively imports qwen_agent → soundfile;
# soundfile is not used by our scorer but must be present for the import to succeed.
"soundfile==0.13.1",
]

[project.scripts]
inference-endpoint = "inference_endpoint.main:run"
Expand Down
Loading
Loading