mlcommons · Palanivelg · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · May 29, 2026
@@ -0,0 +1,59 @@
+# BFCL v4 single-turn accuracy — edge device (~3h) budget
+#
+# Runs the three single-turn BFCL v4 categories (non_live, live, hallucination)
+# through the accuracy pipeline with per-category sampling tuned to draw ~995
+# samples — large enough that the point estimate is stable across draws while
+# still finishing on an edge device in ~3 hours. This single-turn run is the
+# finalized accuracy benchmark.
+#
+# For final submission, fields marked "do not change" must be kept as-is.
+# Fields marked "set to your value" must be updated for your environment.
+#
+# Run:
+#   inference-endpoint benchmark from-config \
+#     --config offline_bfcl_v4_single_turn.yaml --accuracy-only
+#
+# Requires: pip install -e ".[bfcl]"
+name: "bfcl-v4-single-turn-accuracy"
+version: "1.0" # do not change.
+type: "offline" # do not change.
+timeout: 10800 # do not change.
+
+model_params:
+  name: "Qwen3.6-27B-Q4_K_M" # set to your served model name.
+  temperature: 0 # do not change.
+
+datasets:
+  - name: bfcl_v4::function_calling # do not change.
+    type: "accuracy" # do not change.
+    params:
+      categories: ["non_live", "live", "hallucination"] # do not change.
+      category_sample_pct:
+        non_live: 62 # do not change.
+        live: 10 # do not change.
+        hallucination: 10 # do not change.
+      # Subsets with total size <= 25 are always taken in full so their scores
+      # are not reduced to one or two noisy samples.
+      subset_floor: 25 # do not change.
+    accuracy_config:
+      eval_method: "bfcl_v4" # do not change.
+      ground_truth: "ground_truth" # do not change.
+      extractor: "function_call_extractor" # do not change.
+      num_repeats: 1 # do not change.
+
+settings:
+  runtime:
+    min_duration_ms: 0 # do not change.
+    dataloader_random_seed: 42 # do not change.
+  load_pattern:
+    type: "max_throughput" # do not change.
+  client:
+    num_workers: 1 # do not change.
+    max_connections: 1 # do not change.
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8080" # set to your endpoint URL.
+  api_key: null
+
+report_dir: results/bfcl_v4_single_turn_accuracy/ # do not change.
@@ -0,0 +1,75 @@
+# Agentic coding performance benchmark — edge device, ~1 h subset (inline accuracy)
+#
+# Replays recorded multi-turn agentic-coding trajectories (SWE-bench-style) as a
+# *performance* workload (throughput / TTFT / TPOT / latency) against the same
+# served endpoint used for BFCL v4 accuracy, and runs an inline "online checker"
+# that scores the replayed trajectories against the recorded tool calls in the
+# dataset (accuracy_config.eval_method: agentic_inference_inline). The dataset is
+# therefore both the performance workload and its own ground truth.
+#
+# Dataset: agentic_coding_1h.jsonl — 9 deep conversations (483 generated turns),
+# one per source repository (django, sympy, scikit-learn, sphinx, matplotlib,
+# xarray, astropy, pytest, requests), each selected so its peak input sequence
+# length stays under the 32K served context. No conversation overflows 32K, so
+# every turn completes and the run is *valid* (0 dropped turns). Sized for a
+# ~1 h single-stream pass on a ~10-12 tok/s edge box (e.g. Jetson Thor,
+# Qwen3.6-27B Q4_K_M, reasoning off). This is the quick "smoke" variant; use
+# online_agentic_coding_2.5h.yaml for a larger, statistically solid run.
+#
+# Reference (NVIDIA Jetson AGX Thor, Q4_K_M + llama.cpp, reasoning off): ~73 min,
+# 483/483 turns, 0 dropped, inline IoU 0.6189.
+#
+# Run (start the edge server first; see README.md):
+#   inference-endpoint benchmark from-config \
+#     --config online_agentic_coding_1h.yaml
+name: "agentic-coding-perf-1h"
+version: "1.0"
+type: "online"
+
+model_params:
+  name: "Qwen3.6-27B-Q4_K_M" # replace with your served model name
+  # Deterministic decoding so the inline accuracy check is reproducible. Reasoning
+  # is disabled server-side (llama-server --reasoning off): on this tool-calling
+  # workload reasoning gives no inline-IoU benefit and costs ~60% more wall-clock.
+  temperature: 0
+  seed: 42
+  max_new_tokens: 1024
+
+datasets:
+  - name: agentic_coding
+    type: performance
+    path: examples/10_Edge_Agentic_Example/agentic_coding_1h.jsonl
+    accuracy_config:
+      # Inline "online checker": score the replayed performance outputs against
+      # the recorded tool calls (multiset IoU of executables). The dataset is
+      # both the performance workload and its own ground truth.
+      eval_method: agentic_inference_inline
+    agentic_inference:
+      # Per-turn deadline; a timeout aborts that turn and the rest of its
+      # conversation. 10 min is generous for slow edge decode.
+      turn_timeout_s: 600.0
+      # One pass over the 9 conversations (no trajectory repeats).
+      num_trajectories_to_issue: 9
+
+settings:
+  runtime:
+    min_duration_ms: 0
+    # Safety cap (2 h) so the run stays bounded even if decode is slower than
+    # expected; one pass should finish in ~1 h on an edge box.
+    max_duration_ms: 7200000
+  load_pattern:
+    type: agentic_inference
+    # Single-stream: one in-flight conversation at a time, matching a single-slot
+    # edge server (llama.cpp -np 1). Raise only for a multi-slot endpoint.
+    target_concurrency: 1
+  client:
+    num_workers: 1
+    max_connections: 1
+    warmup_connections: 0
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8080"
+  api_type: openai
+
+report_dir: results/agentic_coding_perf_1h/
@@ -0,0 +1,77 @@
+# Agentic coding performance benchmark — edge device, ~2.5 h subset (inline accuracy)
+#
+# Replays recorded multi-turn agentic-coding trajectories (SWE-bench-style) as a
+# *performance* workload (throughput / TTFT / TPOT / latency) against the same
+# served endpoint used for BFCL v4 accuracy, and runs an inline "online checker"
+# that scores the replayed trajectories against the recorded tool calls in the
+# dataset (accuracy_config.eval_method: agentic_inference_inline). The dataset is
+# therefore both the performance workload and its own ground truth.
+#
+# Dataset: agentic_coding_2.5h.jsonl — 20 deep conversations (1007 generated
+# turns) spread across 9 source repositories, each selected so its peak input
+# sequence length stays under the 32K served context (measured peak ISL ~23.5K,
+# leaving headroom for the 1024-token output cap). No conversation overflows
+# 32K, so every turn completes and the run is *valid* (0 dropped turns). This is
+# the recommended reference performance run: ~2× the sample of the 1 h variant,
+# giving a more stable inline-IoU estimate, sized for a ~2.5 h single-stream
+# pass on a ~10-12 tok/s edge box (e.g. Jetson Thor, Qwen3.6-27B Q4_K_M,
+# reasoning off); with serving optimizations (e.g. MTP speculative decoding)
+# this drops toward ~1.5 h.
+#
+# Reference (NVIDIA Jetson AGX Thor, Q4_K_M + llama.cpp, reasoning off): ~2 h 37 m,
+# 1007/1007 turns, 0 dropped, inline IoU 0.6335.
+#
+# Run (start the edge server first; see README.md):
+#   inference-endpoint benchmark from-config \
+#     --config online_agentic_coding_2.5h.yaml
+name: "agentic-coding-perf-2.5h"
+version: "1.0"
+type: "online"
+
+model_params:
+  name: "Qwen3.6-27B-Q4_K_M" # replace with your served model name
+  # Deterministic decoding so the inline accuracy check is reproducible. Reasoning
+  # is disabled server-side (llama-server --reasoning off): on this tool-calling
+  # workload reasoning gives no inline-IoU benefit and costs ~60% more wall-clock.
+  temperature: 0
+  seed: 42
+  max_new_tokens: 1024
+
+datasets:
+  - name: agentic_coding
+    type: performance
+    path: examples/10_Edge_Agentic_Example/agentic_coding_2.5h.jsonl
+    accuracy_config:
+      # Inline "online checker": score the replayed performance outputs against
+      # the recorded tool calls (multiset IoU of executables). The dataset is
+      # both the performance workload and its own ground truth.
+      eval_method: agentic_inference_inline
+    agentic_inference:
+      # Per-turn deadline; a timeout aborts that turn and the rest of its
+      # conversation. 10 min is generous for slow edge decode.
+      turn_timeout_s: 600.0
+      # One pass over the 20 conversations (no trajectory repeats).
+      num_trajectories_to_issue: 20
+
+settings:
+  runtime:
+    min_duration_ms: 0
+    # Safety cap (4 h) so the run stays bounded even if decode is slower than
+    # expected; one pass should finish in ~2.5 h on an edge box.
+    max_duration_ms: 14400000
+  load_pattern:
+    type: agentic_inference
+    # Single-stream: one in-flight conversation at a time, matching a single-slot
+    # edge server (llama.cpp -np 1). Raise only for a multi-slot endpoint.
+    target_concurrency: 1
+  client:
+    num_workers: 1
+    max_connections: 1
+    warmup_connections: 0
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8080"
+  api_type: openai
+
+report_dir: results/agentic_coding_perf_2.5h/
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Reproduce the BFCL v4 edge-agentic accuracy reference result (~3 h on an edge
+# device). The finalized accuracy benchmark is single-turn only, sampled to ~995
+# samples (see offline_bfcl_v4_single_turn.yaml).
+#
+# Usage:
+#   1. Edit MODEL and ENDPOINT below to match your server.
+#   2. bash run_accuracy.sh
+#
+# Results are written to:
+#   results/bfcl_v4_single_turn_accuracy/   (single-turn)
+#
+# Multi-turn is no longer part of the accuracy gate; see README Step 3 for the
+# optional exploratory multi-turn run.
+
+set -euo pipefail
+
+MODEL="${MODEL:-Qwen3.6-27B-Q4_K_M}"
+ENDPOINT="${ENDPOINT:-http://localhost:8080}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+echo "=== BFCL v4 edge-agentic accuracy run ==="
+echo "  Model:    $MODEL"
+echo "  Endpoint: $ENDPOINT"
+echo ""
+
+# from-config reads model name and endpoint from the YAML (it has no
+# model/endpoint override flags). Render a temp config with MODEL/ENDPOINT
+# substituted in so the env vars above take effect without editing the
+# committed YAML; the trailing "# set to your ..." comments anchor the edit.
+ST_CONFIG="$(mktemp --suffix=.yaml)"
+trap 'rm -f "$ST_CONFIG"' EXIT
+sed -E \
+    -e "s|^( *name: ).*(# set to your served model name\.)|\1\"${MODEL}\" \2|" \
+    -e "s|^( *- ).*(# set to your endpoint URL\.)|\1\"${ENDPOINT}\" \2|" \
+    offline_bfcl_v4_single_turn.yaml > "$ST_CONFIG"
+
+# Single-turn: non_live (62%), live (10%), hallucination (10%) — ~995 samples, ~3 h
+echo "--- Single-turn (~995 samples, ~3 h) ---"
+inference-endpoint benchmark from-config \
+    --config "$ST_CONFIG" \
+    --accuracy-only
+
+echo ""
+echo "=== Done. Results in results/bfcl_v4_single_turn_accuracy/ ==="
@@ -9,6 +9,16 @@ environments = [
     "sys_platform == 'darwin' and platform_machine == 'x86_64'",
     "sys_platform == 'darwin' and platform_machine == 'arm64'",
 ]
+# bfcl-eval hard-pins an old dependency set (numpy==1.26.4, filelock, etc.).
+# Mark bfcl as conflicting with the tooling extras so uv resolves it in its own
+# fork; otherwise those old pins drag shared dev/test/performance deps (filelock,
+# virtualenv) down to versions with known CVEs. CI installs the tooling extras
+# without bfcl; bfcl is installed standalone (pip install -e ".[bfcl]").
+conflicts = [
+    [{ extra = "bfcl" }, { extra = "dev" }],
+    [{ extra = "bfcl" }, { extra = "test" }],
+    [{ extra = "bfcl" }, { extra = "performance" }],
+]
 
 [tool.uv.build-backend]
 module-root = "src"
@@ -59,7 +69,7 @@ dependencies = [
     "transformers==5.5.0",
     # Required by transformers' apply_chat_template
     "jinja2==3.1.6",
-    "numpy==2.4.4",
+    "numpy>=1.26.4",
     "datasets==4.8.4",
     "Pillow==12.2.0",
     "sentencepiece==0.2.1",
@@ -95,6 +105,13 @@ dev = [
     "myst-parser==5.0.0",
     # Security auditing
     "pip-audit==2.10.0",
+    # bfcl-eval hard-pins filelock==3.20.0, which uv would otherwise share across
+    # every resolution fork. Because bfcl conflicts with this extra (see
+    # [tool.uv].conflicts), these floors force uv to fork filelock/virtualenv:
+    # patched here, pinned only inside the bfcl fork. Closes CVE-2025-68146 /
+    # CVE-2026-22701 (filelock) and CVE-2026-22702 (virtualenv).
+    "filelock>=3.20.3",
+    "virtualenv>=20.36.1",
 ]
 test = [
     # Includes optional dependencies for full test coverage
@@ -122,6 +139,14 @@ performance = [
     "pytest-benchmark==5.2.3",
     "memory-profiler==0.61.0",
 ]
+bfcl = [
+    # BFCL v4 function-calling evaluation. Pins numpy==1.26.4, which is why
+    # the top-level numpy requirement is a lower bound (>=1.26.4).
+    "bfcl-eval==2026.3.23",
+    # bfcl-eval's qwen model handler transitively imports qwen_agent → soundfile;
+    # soundfile is not used by our scorer but must be present for the import to succeed.
+    "soundfile==0.13.1",
+]
 
 [project.scripts]
 inference-endpoint = "inference_endpoint.main:run"