2 changes: 2 additions & 0 deletions tests/benchmarks/README.md
@@ -12,6 +12,7 @@ Professional benchmarks to validate CascadeFlow performance across real-world us
6. **TruthfulQA** - Factual correctness (sampled)
7. **Tool Calling** - Structured tool selection correctness (single + multi-turn)
8. **BFCL Agentic** - Agentic/multi-turn tool-calling patterns (dependencies, chaining)
9. **ToolCalls Real-World** - Tool routing with multi-turn context

#### Metrics

@@ -20,6 +21,7 @@ Each benchmark measures:
- **Quality maintenance** (accuracy/pass rate)
- **Latency** improvements
- **Escalation rates** (drafter acceptance %)
- **Direct routing** counts and **cascade overhead** latency
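
The cascade-overhead number is meant to isolate CascadeFlow's own decision-making time (complexity detection, domain detection, tool-complexity analysis, and quality verification) from end-to-end request latency. A minimal sketch of that sum, using a stand-in result object rather than the real `RoutingResult` type; the field names mirror the benchmark code below, and the `FakeResult` class is illustrative only:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeResult:
    """Stand-in for a CascadeFlow routing result (illustrative only)."""

    complexity_detection_ms: Optional[float] = None
    quality_verification_ms: Optional[float] = None
    metadata: Optional[dict] = None


def cascade_overhead_ms(result: FakeResult) -> float:
    """Sum CascadeFlow's own bookkeeping time, excluding model generation."""
    meta = result.metadata or {}
    return (
        (result.complexity_detection_ms or 0)
        + meta.get("domain_detection_ms", 0)
        + meta.get("tool_complexity_analysis_ms", 0)
        + (result.quality_verification_ms or 0)
    )


# 3 ms complexity scoring + 1.5 ms domain detection + 12 ms quality verification.
print(cascade_overhead_ms(FakeResult(3.0, 12.0, {"domain_detection_ms": 1.5})))  # 16.5
```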

#### Running Benchmarks

7 changes: 7 additions & 0 deletions tests/benchmarks/banking77_benchmark.py
@@ -410,12 +410,19 @@ async def run_cascade(self, query: str) -> dict[str, Any]:
"model_used": result.model_used,
"accepted": result.draft_accepted,
"quality_score": result.quality_score,
"routing_strategy": result.routing_strategy,
"drafter_cost": result.draft_cost,
"verifier_cost": result.verifier_cost,
"total_cost": result.total_cost,
"cost_saved": cost_saved,
"baseline_cost": baseline_cost,
"latency_ms": latency_ms,
"cascadeflow_latency_ms": (
(result.complexity_detection_ms or 0)
+ (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0)
+ (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0)
+ (result.quality_verification_ms or 0)
),
"tokens_input": int(prompt_tokens or 0),
"tokens_output": int(completion_tokens or 0),
# Diagnostics for benchmark debug hooks.
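
The `run_cascade` dict above now exposes `routing_strategy` and `cascadeflow_latency_ms` alongside the existing cost and token fields, and `base.py` copies both onto `BenchmarkResult`. A hedged sketch of how a debug hook might bucket per-query records by routing outcome; the record shape is abridged and the helper is not part of this PR:

```python
from collections import Counter


def bucket_outcomes(records: list[dict]) -> Counter:
    """Classify each per-query record as direct, accepted, or escalated."""
    buckets: Counter = Counter()
    for record in records:
        if record.get("routing_strategy") == "direct":
            buckets["direct"] += 1      # bypassed the cascade entirely
        elif record.get("accepted"):
            buckets["accepted"] += 1    # drafter answer kept
        else:
            buckets["escalated"] += 1   # verifier answer used
    return buckets


# Records shaped like run_cascade()'s return value, trimmed to the two fields used here.
print(bucket_outcomes([
    {"routing_strategy": "direct", "accepted": True},
    {"routing_strategy": "cascade", "accepted": True},
    {"routing_strategy": "cascade", "accepted": False},
]))  # Counter({'direct': 1, 'accepted': 1, 'escalated': 1})
```
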
69 changes: 62 additions & 7 deletions tests/benchmarks/base.py
@@ -22,6 +22,7 @@ class BenchmarkResult:
model_used: str # "drafter" or "verifier"
accepted: bool # True if drafter accepted
quality_score: float # 0-1 quality score from verifier
routing_strategy: str # "cascade" or "direct"

# Cost metrics (in USD)
drafter_cost: float
@@ -31,6 +32,7 @@

# Performance metrics
latency_ms: float
cascadeflow_latency_ms: float
tokens_input: int
tokens_output: int

@@ -46,7 +48,17 @@
@property
def escalated(self) -> bool:
"""True if query was escalated to verifier."""
return not self.accepted
return self.verifier_rejected

@property
def direct_routed(self) -> bool:
"""True if query was routed directly (no cascade)."""
return self.routing_strategy == "direct"

@property
def verifier_rejected(self) -> bool:
"""True if draft was rejected and verifier was used."""
return self.routing_strategy == "cascade" and not self.accepted

@property
def cost_savings(self) -> float:
@@ -98,6 +110,8 @@ class BenchmarkSummary:
escalated_to_verifier: int
acceptance_rate_pct: float
escalation_rate_pct: float
direct_routed: int
direct_routing_pct: float

# Cost metrics
total_cost: float
@@ -112,11 +126,13 @@
avg_latency_ms: float
median_latency_ms: float
p95_latency_ms: float
avg_cascadeflow_latency_ms: float

# Quality metrics
accuracy: float # Percentage of correct predictions
drafter_accuracy: float # Accuracy when drafter was used
verifier_accuracy: float # Accuracy when verifier was used
direct_accuracy: float # Accuracy when routed directly

# Token usage
total_input_tokens: int
@@ -290,6 +306,12 @@ async def run(self) -> BenchmarkSummary:
cascade_result["prediction"], ground_truth
)

routing_strategy = cascade_result.get("routing_strategy")
if not routing_strategy:
routing_strategy = (
"direct" if cascade_result.get("direct_routed") else "cascade"
)

tokens_input = cascade_result["tokens_input"]
tokens_output = cascade_result["tokens_output"]
if tokens_input == 0:
@@ -323,11 +345,13 @@
model_used=cascade_result["model_used"],
accepted=cascade_result["accepted"],
quality_score=cascade_result["quality_score"],
routing_strategy=routing_strategy,
drafter_cost=cascade_result["drafter_cost"],
verifier_cost=cascade_result["verifier_cost"],
total_cost=cascade_result["total_cost"],
baseline_cost=baseline_cost,
latency_ms=cascade_result["latency_ms"],
cascadeflow_latency_ms=cascade_result.get("cascadeflow_latency_ms", 0.0),
tokens_input=tokens_input,
tokens_output=tokens_output,
ground_truth=ground_truth,
@@ -364,11 +388,13 @@
model_used="error",
accepted=False,
quality_score=0.0,
routing_strategy="cascade",
drafter_cost=0.0,
verifier_cost=0.0,
total_cost=0.0,
baseline_cost=0.0,
latency_ms=0.0,
cascadeflow_latency_ms=0.0,
tokens_input=0,
tokens_output=0,
ground_truth=ground_truth,
@@ -414,6 +440,8 @@ def _generate_summary(self) -> BenchmarkSummary:
escalated_to_verifier=0,
acceptance_rate_pct=0.0,
escalation_rate_pct=0.0,
direct_routed=0,
direct_routing_pct=0.0,
total_cost=0.0,
effective_total_cost=0.0,
total_baseline_cost=0.0,
@@ -424,9 +452,11 @@
avg_latency_ms=0.0,
median_latency_ms=0.0,
p95_latency_ms=0.0,
avg_cascadeflow_latency_ms=0.0,
accuracy=0.0,
drafter_accuracy=0.0,
verifier_accuracy=0.0,
direct_accuracy=0.0,
total_input_tokens=0,
total_output_tokens=0,
avg_input_tokens=0.0,
@@ -452,8 +482,12 @@
result.baseline_cost = max(result.baseline_cost, baseline_cost)

# Cascade metrics
drafter_accepted = sum(1 for r in valid_results if r.accepted)
escalated = sum(1 for r in valid_results if r.escalated)
direct_routed = sum(1 for r in valid_results if r.direct_routed)
drafter_accepted = sum(
1 for r in valid_results if r.routing_strategy == "cascade" and r.accepted
)
escalated = sum(1 for r in valid_results if r.verifier_rejected)
cascade_total = drafter_accepted + escalated

# Cost metrics
total_cost = sum(r.total_cost for r in valid_results)
@@ -473,23 +507,35 @@
median_latency = latencies[len(latencies) // 2]
p95_idx = int(len(latencies) * 0.95)
p95_latency = latencies[p95_idx]
cascadeflow_latencies = [r.cascadeflow_latency_ms for r in valid_results]
avg_cascadeflow_latency = (
sum(cascadeflow_latencies) / len(cascadeflow_latencies)
if cascadeflow_latencies
else 0.0
)

# Quality metrics
correct = sum(1 for r in valid_results if r.is_correct)
accuracy = (correct / len(valid_results) * 100) if valid_results else 0.0

drafter_results = [r for r in valid_results if r.accepted]
drafter_results = [
r for r in valid_results if r.routing_strategy == "cascade" and r.accepted
]
drafter_correct = sum(1 for r in drafter_results if r.is_correct)
drafter_accuracy = (
(drafter_correct / len(drafter_results) * 100) if drafter_results else 0.0
)

verifier_results = [r for r in valid_results if r.escalated]
verifier_results = [r for r in valid_results if r.verifier_rejected]
verifier_correct = sum(1 for r in verifier_results if r.is_correct)
verifier_accuracy = (
(verifier_correct / len(verifier_results) * 100) if verifier_results else 0.0
)

direct_results = [r for r in valid_results if r.direct_routed]
direct_correct = sum(1 for r in direct_results if r.is_correct)
direct_accuracy = (direct_correct / len(direct_results) * 100) if direct_results else 0.0

# Token usage
total_input = sum(r.tokens_input for r in valid_results)
total_output = sum(r.tokens_output for r in valid_results)
@@ -501,8 +547,12 @@
failed_tests=failed,
drafter_accepted=drafter_accepted,
escalated_to_verifier=escalated,
acceptance_rate_pct=(drafter_accepted / successful * 100) if successful > 0 else 0.0,
escalation_rate_pct=(escalated / successful * 100) if successful > 0 else 0.0,
acceptance_rate_pct=(
(drafter_accepted / cascade_total * 100) if cascade_total > 0 else 0.0
),
escalation_rate_pct=(escalated / cascade_total * 100) if cascade_total > 0 else 0.0,
direct_routed=direct_routed,
direct_routing_pct=(direct_routed / successful * 100) if successful > 0 else 0.0,
total_cost=total_cost,
effective_total_cost=effective_total_cost,
total_baseline_cost=total_baseline,
Expand All @@ -513,9 +563,11 @@ def _generate_summary(self) -> BenchmarkSummary:
avg_latency_ms=avg_latency,
median_latency_ms=median_latency,
p95_latency_ms=p95_latency,
avg_cascadeflow_latency_ms=avg_cascadeflow_latency,
accuracy=accuracy,
drafter_accuracy=drafter_accuracy,
verifier_accuracy=verifier_accuracy,
direct_accuracy=direct_accuracy,
total_input_tokens=total_input,
total_output_tokens=total_output,
avg_input_tokens=total_input / successful if successful > 0 else 0.0,
@@ -540,6 +592,7 @@ def _print_summary(self, summary: BenchmarkSummary) -> None:
print(
f" Escalated: {summary.escalated_to_verifier} ({summary.escalation_rate_pct:.1f}%)"
)
print(f" Direct Routed: {summary.direct_routed} ({summary.direct_routing_pct:.1f}%)")

print("\nCOST ANALYSIS:")
print(f" Total Cost: ${summary.total_cost:.6f}")
@@ -554,11 +607,13 @@ def _print_summary(self, summary: BenchmarkSummary) -> None:
print(f" Avg Latency: {summary.avg_latency_ms:.0f}ms")
print(f" Median Latency: {summary.median_latency_ms:.0f}ms")
print(f" P95 Latency: {summary.p95_latency_ms:.0f}ms")
print(f" Avg Cascade Overhead:{summary.avg_cascadeflow_latency_ms:.0f}ms")

print("\nQUALITY:")
print(f" Overall Accuracy: {summary.accuracy:.1f}%")
print(f" Drafter Accuracy: {summary.drafter_accuracy:.1f}%")
print(f" Verifier Accuracy: {summary.verifier_accuracy:.1f}%")
print(f" Direct Accuracy: {summary.direct_accuracy:.1f}%")

print("\nTOKEN USAGE:")
print(f" Total Input: {summary.total_input_tokens:,}")
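
With direct routing in the mix, acceptance and escalation rates are now computed over cascaded queries only (`cascade_total`), while the direct-routing share uses all successful queries as its denominator. A worked example of the new arithmetic, assuming 10 successful queries of which 4 were routed directly:

```python
successful = 10
direct_routed = 4
drafter_accepted = 4   # cascaded, draft kept
escalated = 2          # cascaded, verifier used
cascade_total = drafter_accepted + escalated  # 6 cascaded queries

acceptance_rate_pct = drafter_accepted / cascade_total * 100  # share of cascaded queries
escalation_rate_pct = escalated / cascade_total * 100         # share of cascaded queries
direct_routing_pct = direct_routed / successful * 100         # share of all queries

print(f"{acceptance_rate_pct:.1f} {escalation_rate_pct:.1f} {direct_routing_pct:.1f}")
# 66.7 33.3 40.0
```

Denominating acceptance on `cascade_total` keeps the rate meaningful when many queries never reach the drafter/verifier pair at all.
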
50 changes: 42 additions & 8 deletions tests/benchmarks/bfcl/bfcl_full_benchmark.py
@@ -29,6 +29,7 @@
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))

from cascadeflow import CascadeAgent, DomainConfig, ModelConfig
from tests.benchmarks.utils import resolve_model_cost, resolve_model_pair, resolve_model_provider


@dataclass
@@ -381,16 +382,31 @@ def _check_function_correct(
response: str,
expected_func: Optional[str],
expected_params: Optional[dict] = None,
expected_funcs: Optional[list[str]] = None,
) -> tuple[bool, bool]:
"""Check if function call is correct."""
response_lower = response.lower()

if expected_funcs:
counts = {}
for func in expected_funcs:
func_key = func.lower()
counts[func_key] = counts.get(func_key, 0) + 1

for func, expected_count in counts.items():
func_mentions = response_lower.count(func) + response_lower.count(
func.replace("_", " ")
)
if func_mentions < expected_count:
return False, True
return True, True

found_func, found_params = self._extract_function_call(response)

if expected_func is None:
# No tool should be used
func_correct = (
found_func is None
or "don't need" in response.lower()
or "no tool" in response.lower()
found_func is None or "don't need" in response_lower or "no tool" in response_lower
)
return func_correct, True

@@ -414,6 +430,7 @@ async def run_single(self, task: dict) -> BFCLResult:
tools = task["tools"]
prompt = task["prompt"]
expected_func = task.get("expected_function")
expected_funcs = task.get("expected_functions")
expected_params = task.get("expected_params")

# Format tools for prompt
@@ -436,10 +453,23 @@
User request: {prompt}"""

# Create agent
drafter_provider = resolve_model_provider(self.drafter_model)
verifier_provider = resolve_model_provider(self.verifier_model)
drafter_cost = resolve_model_cost(self.drafter_model, 0.00015)
verifier_cost = resolve_model_cost(self.verifier_model, 0.0025)

agent = CascadeAgent(
models=[
ModelConfig(name=self.drafter_model, provider="openai", cost=0.00015),
ModelConfig(name=self.verifier_model, provider="openai", cost=0.0025),
ModelConfig(
name=self.drafter_model,
provider=drafter_provider,
cost=drafter_cost,
),
ModelConfig(
name=self.verifier_model,
provider=verifier_provider,
cost=verifier_cost,
),
],
enable_domain_detection=True,
use_semantic_domains=True,
@@ -452,7 +482,10 @@ async def run_single(self, task: dict) -> BFCLResult:
latency_ms = (time.time() - start_time) * 1000

func_correct, params_correct = self._check_function_correct(
result.content, expected_func, expected_params
result.content,
expected_func,
expected_params,
expected_funcs,
)

found_func, _ = self._extract_function_call(result.content)
@@ -572,8 +605,9 @@ async def main():
parser = argparse.ArgumentParser(description="BFCL-style Function Calling Benchmark")
parser.add_argument("--sample", type=int, help="Run N tasks")
parser.add_argument("--full", action="store_true", help="Run all tasks")
parser.add_argument("--drafter", default="gpt-4o-mini")
parser.add_argument("--verifier", default="gpt-4o")
default_drafter, default_verifier = resolve_model_pair("gpt-4o-mini", "gpt-4o")
parser.add_argument("--drafter", default=default_drafter)
parser.add_argument("--verifier", default=default_verifier)

args = parser.parse_args()

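
The `_check_function_correct` change above handles multi-call tasks by counting how many times each expected function is mentioned (case-insensitive, with underscores also matched as spaces) and requiring at least that many mentions. A standalone sketch of that counting rule, using `Counter` for brevity instead of the benchmark's manual dict, with toy response text rather than BFCL data:

```python
from collections import Counter


def multi_call_mentions_ok(response: str, expected_funcs: list[str]) -> bool:
    """True if every expected function is mentioned at least as often as required."""
    response_lower = response.lower()
    for func, needed in Counter(f.lower() for f in expected_funcs).items():
        mentions = response_lower.count(func) + response_lower.count(func.replace("_", " "))
        if mentions < needed:
            return False
    return True


# Two get_weather calls expected, so the response must mention the name twice.
resp = "First call get_weather for Paris, then get_weather for Tokyo."
print(multi_call_mentions_ok(resp, ["get_weather", "get_weather"]))               # True
print(multi_call_mentions_ok("Only one get_weather call.", ["get_weather"] * 2))  # False
```
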
7 changes: 7 additions & 0 deletions tests/benchmarks/customer_support.py
@@ -522,12 +522,19 @@ def _provider_for(model_name: str) -> str:
"model_used": result.model_used,
"accepted": result.draft_accepted,
"quality_score": result.quality_score or 0.0,
"routing_strategy": result.routing_strategy,
"drafter_cost": result.draft_cost or 0.0,
"verifier_cost": result.verifier_cost or 0.0,
"total_cost": result.total_cost,
"cost_saved": cost_saved,
"baseline_cost": baseline_cost,
"latency_ms": result.latency_ms,
"cascadeflow_latency_ms": (
(result.complexity_detection_ms or 0)
+ (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0)
+ (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0)
+ (result.quality_verification_ms or 0)
),
"tokens_input": int(result.metadata.get("prompt_tokens") or 0),
"tokens_output": int(result.metadata.get("completion_tokens") or 0),
# Diagnostic fields (used only by Benchmark.on_result hooks if present).
6 changes: 6 additions & 0 deletions tests/benchmarks/gsm8k/gsm8k.py
@@ -379,6 +379,12 @@ def _provider_for(model_name: str) -> str:
"cost_saved": cost_saved,
"baseline_cost": baseline_cost, # For accurate savings calculation
"latency_ms": latency_ms,
"cascadeflow_latency_ms": (
(result.complexity_detection_ms or 0)
+ (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0)
+ (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0)
+ (result.quality_verification_ms or 0)
),
"tokens_input": prompt_tokens,
"tokens_output": completion_tokens,
"routing_strategy": routing_strategy,
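
banking77, customer_support, and gsm8k now report the same per-query overhead sum, and the summary exposes it as `avg_cascadeflow_latency_ms` next to the new direct-routing fields. A hedged sketch of reading those summary fields after a run; the `SummarySlice` stand-in and the example numbers are illustrative, not output from this PR:

```python
from dataclasses import dataclass


@dataclass
class SummarySlice:
    """Stand-in for the new BenchmarkSummary fields (illustrative only)."""

    direct_routed: int
    direct_routing_pct: float
    direct_accuracy: float
    avg_latency_ms: float
    avg_cascadeflow_latency_ms: float


def report_overhead(s: SummarySlice) -> str:
    """Express cascade overhead as a share of average end-to-end latency."""
    share = (s.avg_cascadeflow_latency_ms / s.avg_latency_ms * 100) if s.avg_latency_ms else 0.0
    return (
        f"direct: {s.direct_routed} ({s.direct_routing_pct:.1f}%, {s.direct_accuracy:.1f}% correct); "
        f"overhead: {s.avg_cascadeflow_latency_ms:.0f}ms ({share:.1f}% of avg latency)"
    )


print(report_overhead(SummarySlice(4, 40.0, 92.5, 850.0, 18.0)))
# direct: 4 (40.0%, 92.5% correct); overhead: 18ms (2.1% of avg latency)
```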