From 62a8d474d9ec5c975f0ce77552e54389d91c6cd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Titsworth-Morin?= Date: Tue, 5 May 2026 23:32:27 +0000 Subject: [PATCH 1/3] research: evaluate GPT-4.1 Mini via SAM AI Gateway for harness fallback Run focused experiment against gpt-4.1-mini through SAM's existing AI Gateway (/openai path with cf-aig-authorization / Unified Billing). Compare tool-call behavior, token efficiency, latency, and response shape against the merged Gemma 4 26B baseline. Key findings: - Two-tool loop PASS with tool_choice: "auto" (zero workarounds) - 1.9x more token-efficient than Gemma 4 (606 vs 1,159 tokens) - ~1.5x faster latency (~2.6s vs ~4.0s total) - No reasoning field (unlike Gemma's free CoT traces) - gpt-4.1-nano NOT recommended (duplicate tool calls observed) - Validates existing SAM proxy code path works without changes - Unified Billing required (cf-aig-authorization header) Recommendation: keep Gemma 4 as free default, add GPT-4.1 Mini as paid fallback tier for latency/efficiency-sensitive workloads. Co-Authored-By: Claude Opus 4.6 --- .../ai-gateway-tool-call/FINDINGS-openai.md | 259 ++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 experiments/ai-gateway-tool-call/FINDINGS-openai.md diff --git a/experiments/ai-gateway-tool-call/FINDINGS-openai.md b/experiments/ai-gateway-tool-call/FINDINGS-openai.md new file mode 100644 index 000000000..fff2aeedb --- /dev/null +++ b/experiments/ai-gateway-tool-call/FINDINGS-openai.md @@ -0,0 +1,259 @@ +# GPT-4.1 Mini Harness Evaluation: Findings + +**Date**: 2026-05-05 +**Branch**: `sam/use-skill-continue-sam-01kqx7` +**Gateway**: SAM AI Gateway (`sam`) via `/openai` path +**Previous experiment**: `FINDINGS-gemma.md` (2026-05-05, Gemma 4 26B baseline) + +## Summary + +GPT-4.1 Mini (`gpt-4.1-mini`) is a **strong alternative** to Gemma 4 26B for SAM's harness tool-calling workloads. It passes all tests with zero workarounds, produces the most token-efficient responses, and has the fastest latency. However, it costs real money (not free-tier), does not produce reasoning traces, and routes through Cloudflare's Unified Billing (requires `cf-aig-authorization`). It is recommended as the **paid fallback** when higher quality or faster response times justify the cost over the free Gemma 4 26B default. + +GPT-4.1 Nano (`gpt-4.1-nano`) exhibited tool-call quality issues (duplicate calls) and is **not recommended** for harness use without further evaluation. + +## Test Environment + +- **Gateway endpoint**: `https://gateway.ai.cloudflare.com/v1/{account_id}/sam/openai/v1/chat/completions` +- **Auth**: `cf-aig-authorization: Bearer {CF_TOKEN}` (Unified Billing — Cloudflare credits cover OpenAI inference) +- **Cost**: Pay-per-token via Cloudflare Unified Billing (see pricing below) +- **Metadata**: `cf-aig-metadata` header with userId, workspaceId, projectId, source, modelId — same schema as Gemma experiment + +## Detailed Findings + +### 1. Two-Tool Loop: PASS (tool_choice: "auto") + +GPT-4.1 Mini completes the get_weather -> calculate -> final answer loop using `tool_choice: "auto"` in exactly 3 turns, same as Gemma 4. 
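
For reference, a minimal sketch of the loop driver that produces turns like the ones tabulated below. The helper names, the canned tool results, and the env-var handling are assumptions for illustration — this is not the experiment script itself. The endpoint, headers, model id, and `tool_choice` setting come from the experiment setup described above.

```ts
// Sketch of the two-tool loop against the SAM gateway /openai path.
// Assumptions: CF_ACCOUNT_ID / CF_AIG_TOKEN env vars, stubbed tool results.
type ToolCall = { id: string; type: 'function'; function: { name: string; arguments: string } };
type Message = { role: string; content: string | null; tool_calls?: ToolCall[]; tool_call_id?: string };

const GATEWAY_URL = `https://gateway.ai.cloudflare.com/v1/${process.env.CF_ACCOUNT_ID}/sam/openai/v1/chat/completions`;

// Canned tool results matching the flow table below (the real run computed the
// Celsius conversion; hard-coded here to keep the sketch short).
const toolImpls: Record<string, (args: Record<string, unknown>) => unknown> = {
  get_weather: () => ({ temperature_f: 72, condition: 'sunny' }),
  calculate: () => ({ result: 22.2222 }),
};

async function runLoop(tools: unknown[], messages: Message[], maxTurns = 5): Promise<string> {
  for (let turn = 0; turn < maxTurns; turn++) {
    const res = await fetch(GATEWAY_URL, {
      method: 'POST',
      headers: {
        'content-type': 'application/json',
        // Unified Billing header — the OpenAI path uses cf-aig-authorization,
        // not Authorization: Bearer (see the Auth Path section below).
        'cf-aig-authorization': `Bearer ${process.env.CF_AIG_TOKEN}`,
        'cf-aig-metadata': JSON.stringify({ source: 'experiment', modelId: 'gpt-4.1-mini' }),
      },
      body: JSON.stringify({ model: 'gpt-4.1-mini', messages, tools, tool_choice: 'auto' }),
    });
    const choice = ((await res.json()) as any).choices[0];
    messages.push(choice.message); // includes content: null plus tool_calls on tool turns

    if (choice.finish_reason !== 'tool_calls') return choice.message.content ?? '';

    for (const call of (choice.message.tool_calls ?? []) as ToolCall[]) {
      const result = toolImpls[call.function.name]?.(JSON.parse(call.function.arguments));
      messages.push({ role: 'tool', tool_call_id: call.id, content: JSON.stringify(result ?? null) });
    }
  }
  throw new Error('tool loop did not terminate');
}
```

Swapping the `/openai` URL segment, the model id, and the auth header for their Workers AI equivalents reproduces the Gemma 4 baseline runs from `FINDINGS-gemma.md`.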
+ +**Flow (3 turns, 3 API calls)**: + +| Turn | Request | Response | +|------|---------|----------| +| 1 | User: "Weather in Paris, temp in Celsius" | `tool_calls: [get_weather({city: "Paris"})]`, `finish_reason: "tool_calls"` | +| 2 | Tool result: `{temperature_f: 72, condition: "sunny"}` | `tool_calls: [calculate({expression: "(72 - 32) * 5/9"})]`, `finish_reason: "tool_calls"` | +| 3 | Tool result: `{result: 22.2222}` | `content: "The weather in Paris is sunny at 72F (~22.2C)"`, `finish_reason: "stop"` | + +**Total tokens**: 153 + 204 + 249 = 606 across 3 turns. +**Total latency**: ~2.6s (3 calls averaging ~870ms each). + +### 2. content: null Handling: PASS (native format) + +`content: null` is OpenAI's own specification for assistant messages with tool_calls. It works without issue — this is the canonical format, not an edge case. + +### 3. No Reasoning Field + +Unlike Gemma 4 which returns a `reasoning` field with chain-of-thought, GPT-4.1 Mini returns only the `content` and `tool_calls` fields. No built-in observability of decision-making process. + +GPT-4.1 Mini does return `annotations: []` and `refusal: null` fields in the message, plus detailed `usage.prompt_tokens_details` and `usage.completion_tokens_details` breakdowns (cached tokens, reasoning tokens, audio tokens). + +### 4. Harness-Style Coding Tools: PASS + +Given "Find processOrder and add error handling for negative total" with grep, read_file, edit_file, bash tools: + +- GPT-4.1 Mini correctly chose `grep({pattern: "function processOrder"})` as the first action +- Single tool call per turn (no unnecessary parallel calls) +- Appropriate tool selection (grep first, not bash or blind edit) + +### 5. GPT-4.1 Nano: NOT RECOMMENDED + +Tested `gpt-4.1-nano` as a potential ultra-cheap option: + +- **Turn 1 failure**: Called `get_weather({city: "Paris"})` **twice** in the same response (duplicate tool_calls array entries) +- This would cause double-execution in a real harness loop +- The model appears to have tool-call quality issues at this size tier +- **Verdict**: Not suitable for harness/orchestrator workloads without significant retry/dedup logic + +### 6. Comparative Analysis + +| Capability | GPT-4.1 Mini | Gemma 4 26B | Qwen 2.5 Coder 32B | +|---|---|---|---| +| **Structured tool_calls with `auto`** | Yes | Yes | **No** (text content) | +| **`content: null` in messages** | Works (native) | Works | **Rejected** | +| **Reasoning/CoT field** | None | `reasoning` | None | +| **Two-tool loop** | PASS | PASS | PASS (with workarounds) | +| **Tokens (2-tool task)** | **606** | 1,159 | N/A | +| **Latency (2-tool task)** | **~2.6s** | ~4.0s | ~3.0s | +| **Workarounds needed** | **None** | **None** | 2 | +| **Context window** | 1M | 32K | 32K | +| **Cost** | Paid (Unified Billing) | $0 (Workers AI) | $0 (Workers AI) | +| **Observability** | usage_details only | reasoning field | None | +| **Duplicate tool calls** | No | No | No | + +### 7. Token Efficiency + +GPT-4.1 Mini is dramatically more token-efficient than Gemma 4: + +| Metric | GPT-4.1 Mini | Gemma 4 26B | Ratio | +|--------|-------------|-------------|-------| +| Turn 1 total tokens | 153 | 386 | **2.5x fewer** | +| Turn 2 total tokens | 204 | 371 | **1.8x fewer** | +| Turn 3 total tokens | 249 | 402 | **1.6x fewer** | +| **Total (full loop)** | **606** | **1,159** | **1.9x fewer** | + +This efficiency comes from GPT-4.1 Mini's smaller prompt overhead and more concise completions (14 tokens for a tool call vs Gemma's 60+). + +### 8. 
Latency + +| Model | Turn 1 | Turn 2 | Turn 3 | Total | +|-------|--------|--------|--------|-------| +| GPT-4.1 Mini | 0.90s | 0.91s | 0.82s | **2.63s** | +| Gemma 4 26B | ~1.3s | ~1.3s | ~1.4s | **~4.0s** | + +GPT-4.1 Mini is ~1.5x faster end-to-end. + +## Auth Path (SAM Proxy Alignment) + +``` +SAM Proxy (POST /ai/v1/chat/completions) + -> resolves model to openai provider (gpt-* prefix) + -> buildOpenAIUrl() -> gateway.ai.cloudflare.com/v1/{account}/sam/openai/v1/chat/completions + -> cf-aig-authorization: Bearer {CF_AIG_TOKEN ?? CF_API_TOKEN} (Unified Billing) + -> cf-aig-metadata: {userId, workspaceId, projectId, source, modelId} +``` + +This is the exact path SAM's `forwardToOpenAI()` function uses in `apps/api/src/routes/ai-proxy.ts`. The experiment validates the full production path. + +**Key difference from Workers AI path**: OpenAI uses `cf-aig-authorization` (Unified Billing header) rather than `Authorization: Bearer` (standard API token). This means Unified Billing must be enabled on the Cloudflare account for OpenAI models to work through the SAM gateway. + +## Request/Response Shapes + +### Tool Call Request (Identical to OpenAI native — no workarounds) + +```json +{ + "model": "gpt-4.1-mini", + "messages": [ + {"role": "system", "content": "You are a coding agent..."}, + {"role": "user", "content": "Find processOrder and add error handling..."} + ], + "tools": [ + { + "type": "function", + "function": { + "name": "grep", + "description": "Search files for a pattern...", + "parameters": {"type": "object", "properties": {"pattern": {"type": "string"}}, "required": ["pattern"]} + } + } + ], + "tool_choice": "auto" +} +``` + +**Headers**: +``` +cf-aig-authorization: Bearer {CF_TOKEN} +Content-Type: application/json +cf-aig-metadata: {"userId":"...","workspaceId":"...","projectId":"...","source":"...","modelId":"gpt-4.1-mini"} +``` + +### Tool Call Response + +```json +{ + "id": "chatcmpl-DcJTRIX23DVL1ceAyRxz8FbOWKdWU", + "object": "chat.completion", + "created": 1778023693, + "model": "gpt-4.1-mini-2025-04-14", + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": null, + "tool_calls": [{ + "id": "call_AuFaJeP9vTrbZF3Eu461HEfc", + "type": "function", + "function": { + "name": "grep", + "arguments": "{\"pattern\":\"function processOrder\"}" + } + }], + "refusal": null, + "annotations": [] + }, + "logprobs": null, + "finish_reason": "tool_calls" + }], + "usage": { + "prompt_tokens": 236, + "completion_tokens": 16, + "total_tokens": 252, + "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, + "completion_tokens_details": {"reasoning_tokens": 0, "audio_tokens": 0, "accepted_prediction_tokens": 0, "rejected_prediction_tokens": 0} + }, + "service_tier": "default", + "system_fingerprint": "fp_34570b7b86" +} +``` + +### Notable Response Shape Differences vs Gemma 4 + +| Field | GPT-4.1 Mini | Gemma 4 26B | +|-------|-------------|-------------| +| `message.reasoning` | Absent | Present (free CoT) | +| `message.refusal` | Present (`null`) | Absent | +| `message.annotations` | Present (`[]`) | Absent | +| `usage.prompt_tokens_details` | Detailed breakdown | Minimal | +| `usage.completion_tokens_details` | Detailed breakdown | Absent | +| `service_tier` | Present (`"default"`) | Absent | +| `system_fingerprint` | Present | Absent | +| Tool call ID format | `call_<26-char alphanum>` | `chatcmpl-tool-<16-char hex>` | + +## Pricing (Cloudflare Unified Billing) + +Pricing via Cloudflare Unified Billing (as of 2026-05-05): + +| Model | Input (per 1M 
tokens) | Output (per 1M tokens) | Cost for 2-tool loop (606 tokens) | +|-------|----------------------|------------------------|-----------------------------------| +| GPT-4.1 Mini | ~$0.40 | ~$1.60 | ~$0.0001 | +| GPT-4.1 Nano | ~$0.10 | ~$0.40 | ~$0.00003 | +| Gemma 4 26B | $0 | $0 | **$0** (Workers AI free tier) | + +## Implications for SAM Harness + +### Model Selection Strategy + +``` +Default (free, good quality): @cf/google/gemma-4-26b-a4b-it (Workers AI) +Paid fallback (fast, efficient): gpt-4.1-mini (OpenAI via Unified Billing) +``` + +### When to Use GPT-4.1 Mini Over Gemma 4 + +1. **Latency-sensitive operations** — GPT-4.1 Mini is ~1.5x faster per turn +2. **Long context tasks** — 1M token window vs Gemma's 32K +3. **Token budget constraints** — uses ~1.9x fewer tokens per loop (fewer billable Workers AI neurons, even though GPT costs money) +4. **When the user has Unified Billing enabled** — the cost per task is negligible (~$0.0001 per tool-call loop) + +### When to Prefer Gemma 4 + +1. **Free tier / cost-zero requirement** — Gemma 4 is completely free on Workers AI +2. **Observability needs** — Gemma's `reasoning` field provides free chain-of-thought logging +3. **No Unified Billing configured** — the OpenAI path requires `CF_AIG_TOKEN` or `CF_API_TOKEN` with billing scope +4. **Self-hosted deployments** — Workers AI requires only a standard Cloudflare token, not OpenAI billing + +### No Code Changes Needed + +GPT-4.1 Mini works through the existing SAM proxy code path (`forwardToOpenAI()` in `ai-proxy.ts`) without any modifications. The `isOpenAIModel()` function already recognizes `gpt-*` prefixes, and `resolveUnifiedBillingToken()` already handles the `cf-aig-authorization` header. + +### Model Registry Consideration + +GPT-4.1 Mini could be added to the model registry with: +- `toolCallSupport: 'native'` (OpenAI's own format, zero workarounds) +- `intendedRole: 'workspace-agent-paid'` or a new tier +- `contextWindow: 1000000` +- `provider: 'openai'` + +## Recommendations + +1. **Keep Gemma 4 26B as default harness model.** Free, good quality, reasoning traces. The cost advantage is decisive for a platform that runs many agent loops. + +2. **Add GPT-4.1 Mini as a configurable paid alternative.** When users or the platform want faster/more-efficient responses and are willing to pay, GPT-4.1 Mini is the clear choice. Add it to the model registry. + +3. **Do NOT use GPT-4.1 Nano for harness work.** Duplicate tool calls indicate insufficient quality for autonomous agent loops. + +4. **Unified Billing is the gate.** The OpenAI path requires `cf-aig-authorization` with a billing-capable token. Self-hosters without Unified Billing cannot use this path — they fall back to Gemma 4 (free) or provide their own OpenAI API key via platform credentials. + +5. 
**Consider a model tier system in the harness config:** + - Tier 0 (free): `@cf/google/gemma-4-26b-a4b-it` — default, always available + - Tier 1 (cheap): `gpt-4.1-mini` — fast, efficient, requires Unified Billing + - Tier 2 (premium): `claude-haiku-4-5` — highest quality, most expensive (also passed the test) From 58b8debf08f74e6714483d1cf6776ba71e713a75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Titsworth-Morin?= Date: Wed, 6 May 2026 02:12:46 +0000 Subject: [PATCH 2/3] fix: correct Workers AI harness pricing metadata --- apps/api/tests/unit/routes/ai-proxy.test.ts | 19 +++--- apps/web/src/lib/api/admin.ts | 2 +- apps/web/src/pages/AdminAIProxy.tsx | 18 +++--- .../playwright/admin-ai-proxy-audit.spec.ts | 14 ++--- .../architecture/agent-harness-integration.md | 2 +- .../ai-gateway-tool-call/FINDINGS-gemma.md | 8 +-- .../ai-gateway-tool-call/FINDINGS-openai.md | 34 ++++++----- experiments/ai-gateway-tool-call/FINDINGS.md | 10 ++-- packages/shared/src/constants/ai-services.ts | 58 +++++++++---------- .../tests/unit/ai-model-registry.test.ts | 38 ++++++------ 10 files changed, 102 insertions(+), 101 deletions(-) diff --git a/apps/api/tests/unit/routes/ai-proxy.test.ts b/apps/api/tests/unit/routes/ai-proxy.test.ts index 9c634e41d..d1cddb958 100644 --- a/apps/api/tests/unit/routes/ai-proxy.test.ts +++ b/apps/api/tests/unit/routes/ai-proxy.test.ts @@ -295,15 +295,15 @@ describe('PLATFORM_AI_MODELS catalog', () => { it('has correct tier assignments', async () => { const { PLATFORM_AI_MODELS } = await import('@simple-agent-manager/shared'); - const freeModels = PLATFORM_AI_MODELS.filter((m) => m.tier === 'free'); + const lowCostModels = PLATFORM_AI_MODELS.filter((m) => m.tier === 'low-cost'); const standardModels = PLATFORM_AI_MODELS.filter((m) => m.tier === 'standard'); const premiumModels = PLATFORM_AI_MODELS.filter((m) => m.tier === 'premium'); - // All Workers AI models are free tier - for (const m of freeModels) { + // Low-cost models route through Cloudflare-billed Workers AI. 
+ for (const m of lowCostModels) { expect(m.provider).toBe('workers-ai'); - expect(m.costPer1kInputTokens).toBe(0); - expect(m.costPer1kOutputTokens).toBe(0); + expect(m.costPer1kInputTokens).toBeGreaterThan(0); + expect(m.costPer1kOutputTokens).toBeGreaterThan(0); } // Standard tier has at least Haiku and GPT-4.1 @@ -328,13 +328,11 @@ describe('PLATFORM_AI_MODELS catalog', () => { expect(providers.has('openai')).toBe(true); }); - it('has positive cost for non-free models', async () => { + it('has positive cost metadata for all catalog models', async () => { const { PLATFORM_AI_MODELS } = await import('@simple-agent-manager/shared'); for (const m of PLATFORM_AI_MODELS) { - if (m.tier !== 'free') { - expect(m.costPer1kInputTokens).toBeGreaterThan(0); - expect(m.costPer1kOutputTokens).toBeGreaterThan(0); - } + expect(m.costPer1kInputTokens).toBeGreaterThan(0); + expect(m.costPer1kOutputTokens).toBeGreaterThan(0); } }); @@ -345,4 +343,3 @@ describe('PLATFORM_AI_MODELS catalog', () => { } }); }); - diff --git a/apps/web/src/lib/api/admin.ts b/apps/web/src/lib/api/admin.ts index 1e1cbf8a5..03be11e45 100644 --- a/apps/web/src/lib/api/admin.ts +++ b/apps/web/src/lib/api/admin.ts @@ -347,7 +347,7 @@ export interface AIProxyConfigResponse { id: string; label: string; provider: 'workers-ai' | 'anthropic' | 'openai'; - tier: 'free' | 'standard' | 'premium'; + tier: 'low-cost' | 'standard' | 'premium'; costPer1kInputTokens: number; costPer1kOutputTokens: number; isDefault?: boolean; diff --git a/apps/web/src/pages/AdminAIProxy.tsx b/apps/web/src/pages/AdminAIProxy.tsx index b11823aca..d7cea88ca 100644 --- a/apps/web/src/pages/AdminAIProxy.tsx +++ b/apps/web/src/pages/AdminAIProxy.tsx @@ -30,13 +30,13 @@ const BILLING_MODE_OPTIONS: Array<{ value: BillingMode; label: string; descripti ]; const TIER_LABELS: Record = { - free: 'Free Tier', + 'low-cost': 'Low Cost', standard: 'Standard', premium: 'Premium', }; const TIER_ORDER: Record = { - free: 0, + 'low-cost': 0, standard: 1, premium: 2, }; @@ -48,15 +48,15 @@ const PROVIDER_LABELS: Record = { }; function formatCost(cost: number): string { - if (cost === 0) return 'Free'; + if (cost === 0) return '$0.0000'; if (cost < 0.001) return `$${cost.toFixed(4)}`; return `$${cost.toFixed(3)}`; } function tierBadgeClasses(tier: string): string { switch (tier) { - case 'free': - return 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400'; + case 'low-cost': + return 'bg-emerald-100 text-emerald-700 dark:bg-emerald-900/30 dark:text-emerald-400'; case 'standard': return 'bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400'; case 'premium': @@ -166,8 +166,8 @@ export function AdminAIProxy() {
Configure the default AI model and billing mode for the platform inference proxy. Models are routed - through Cloudflare AI Gateway. Workers AI models are free; Anthropic and OpenAI models - require credentials or Unified Billing. + through Cloudflare AI Gateway. Workers AI models are Cloudflare-billed; Anthropic and OpenAI + models require credentials or Unified Billing. {error && ( @@ -265,7 +265,7 @@ export function AdminAIProxy() { ))}

- Workers AI models are free. Anthropic and OpenAI models require credentials on the{' '} + Workers AI models bill through Cloudflare. Anthropic and OpenAI models require credentials on the{' '} Credentials {' '} @@ -337,7 +337,7 @@ export function AdminAIProxy() { {formatCost(model.costPer1kOutputTokens)}/1K out ) : ( - Free + No metered cost )} {!model.available && ( diff --git a/apps/web/tests/playwright/admin-ai-proxy-audit.spec.ts b/apps/web/tests/playwright/admin-ai-proxy-audit.spec.ts index 6b95bfae9..574177bfd 100644 --- a/apps/web/tests/playwright/admin-ai-proxy-audit.spec.ts +++ b/apps/web/tests/playwright/admin-ai-proxy-audit.spec.ts @@ -33,14 +33,14 @@ function makeConfig(overrides: Record = {}) { hasOpenAICredential: false, hasUnifiedBilling: false, models: [ - { id: '@cf/meta/llama-4-scout-17b-16e-instruct', label: 'Llama 4 Scout 17B', provider: 'workers-ai', tier: 'free', costPer1kInputTokens: 0, costPer1kOutputTokens: 0, isDefault: true, available: true }, - { id: '@cf/qwen/qwen3-30b-a3b-fp8', label: 'Qwen3 30B', provider: 'workers-ai', tier: 'free', costPer1kInputTokens: 0, costPer1kOutputTokens: 0, available: true }, - { id: '@cf/google/gemma-3-12b-it', label: 'Gemma 3 12B', provider: 'workers-ai', tier: 'free', costPer1kInputTokens: 0, costPer1kOutputTokens: 0, available: true }, - { id: 'claude-haiku-4-5-20251001', label: 'Claude Haiku 4.5', provider: 'anthropic', tier: 'standard', costPer1kInputTokens: 0.0008, costPer1kOutputTokens: 0.004, available: false }, + { id: '@cf/meta/llama-4-scout-17b-16e-instruct', label: 'Llama 4 Scout 17B', provider: 'workers-ai', tier: 'low-cost', costPer1kInputTokens: 0.00027, costPer1kOutputTokens: 0.00085, isDefault: true, available: true }, + { id: '@cf/qwen/qwen3-30b-a3b-fp8', label: 'Qwen3 30B', provider: 'workers-ai', tier: 'low-cost', costPer1kInputTokens: 0.000051, costPer1kOutputTokens: 0.000335, available: true }, + { id: '@cf/google/gemma-3-12b-it', label: 'Gemma 3 12B', provider: 'workers-ai', tier: 'low-cost', costPer1kInputTokens: 0.00035, costPer1kOutputTokens: 0.00056, available: true }, + { id: 'claude-haiku-4-5-20251001', label: 'Claude Haiku 4.5', provider: 'anthropic', tier: 'standard', costPer1kInputTokens: 0.001, costPer1kOutputTokens: 0.005, available: false }, { id: 'claude-sonnet-4-6', label: 'Claude Sonnet 4.6', provider: 'anthropic', tier: 'standard', costPer1kInputTokens: 0.003, costPer1kOutputTokens: 0.015, available: false }, { id: 'gpt-4.1-mini', label: 'GPT-4.1 Mini', provider: 'openai', tier: 'standard', costPer1kInputTokens: 0.0004, costPer1kOutputTokens: 0.0016, available: false }, { id: 'gpt-4.1', label: 'GPT-4.1', provider: 'openai', tier: 'standard', costPer1kInputTokens: 0.002, costPer1kOutputTokens: 0.008, available: false }, - { id: 'claude-opus-4-6', label: 'Claude Opus 4.6', provider: 'anthropic', tier: 'premium', costPer1kInputTokens: 0.015, costPer1kOutputTokens: 0.075, available: false }, + { id: 'claude-opus-4-6', label: 'Claude Opus 4.6', provider: 'anthropic', tier: 'premium', costPer1kInputTokens: 0.005, costPer1kOutputTokens: 0.025, available: false }, { id: 'gpt-5.2', label: 'GPT-5.2', provider: 'openai', tier: 'premium', costPer1kInputTokens: 0.01, costPer1kOutputTokens: 0.04, available: false }, ], ...overrides, @@ -83,7 +83,7 @@ async function screenshot(page: Page, name: string) { test.describe('AdminAIProxy — Mobile', () => { test.use({ viewport: { width: 375, height: 667 }, isMobile: true }); - test('normal data — free tier default', async ({ page }) => { + test('normal data — 
low-cost Workers AI default', async ({ page }) => { await setupApiMocks(page); await page.goto('/admin/ai-proxy'); await screenshot(page, 'admin-ai-proxy-normal-mobile'); @@ -148,7 +148,7 @@ test.describe('AdminAIProxy — Mobile', () => { test.describe('AdminAIProxy — Desktop', () => { test.use({ viewport: { width: 1280, height: 800 }, isMobile: false }); - test('normal data — free tier default', async ({ page }) => { + test('normal data — low-cost Workers AI default', async ({ page }) => { await setupApiMocks(page); await page.goto('/admin/ai-proxy'); await screenshot(page, 'admin-ai-proxy-normal-desktop'); diff --git a/docs/architecture/agent-harness-integration.md b/docs/architecture/agent-harness-integration.md index 3874c7736..34128d799 100644 --- a/docs/architecture/agent-harness-integration.md +++ b/docs/architecture/agent-harness-integration.md @@ -184,7 +184,7 @@ Gemma 4 26B is the current recommended Workers AI model for harness/orchestrator - Produces structured `tool_calls` with `tool_choice: "auto"` (no forcing required) - Handles OpenAI-format `content: null` without workarounds - Returns built-in `reasoning` field for observability -- Runs on the Workers AI free tier +- Runs through Cloudflare-billed Workers AI at low per-token rates - Has official `function_calling=true` in Cloudflare model metadata ### Fallback: Qwen 2.5 Coder 32B (`@cf/qwen/qwen2.5-coder-32b-instruct`) diff --git a/experiments/ai-gateway-tool-call/FINDINGS-gemma.md b/experiments/ai-gateway-tool-call/FINDINGS-gemma.md index 1fdc00e0b..00e1a2ed6 100644 --- a/experiments/ai-gateway-tool-call/FINDINGS-gemma.md +++ b/experiments/ai-gateway-tool-call/FINDINGS-gemma.md @@ -13,7 +13,7 @@ Gemma 4 26B (`@cf/google/gemma-4-26b-a4b-it`) is **strictly superior** to all te - **Gateway endpoint**: `https://gateway.ai.cloudflare.com/v1/{account_id}/sam/workers-ai/v1/chat/completions` - **Auth**: `Authorization: Bearer {CF_TOKEN}` (standard Cloudflare API token — no Unified Billing needed for Workers AI path) -- **Cost**: $0 (Workers AI free tier) +- **Cost**: Cloudflare Workers AI billing ($0.10 per 1M input tokens, $0.30 per 1M output tokens as of 2026-05-06) - **Metadata**: `cf-aig-metadata` header with userId, workspaceId, projectId, source, modelId — same schema as existing SAM proxy ## Detailed Findings @@ -60,7 +60,7 @@ Step 3: Call `calculate(expression='(F_value - 32) * 5 / 9')` where `F_value` is Step 4: Respond to the user with the weather condition and the temperature in Celsius. ``` -This provides free observability for harness traces without needing an explicit "think step by step" prompt. +This provides built-in observability for harness traces without needing an explicit "think step by step" prompt. ### 4. Harness-Style Coding Tools: PASS @@ -82,7 +82,7 @@ Tested with `grep`, `read_file`, `edit_file`, `bash` tools (the planned harness | **Workarounds needed** | **None** | 2 | None | | **CF function_calling flag** | `true` | N/A | N/A | | **Context window** | 32K | 32K | 32K | -| **Cost** | $0 (Workers AI) | $0 (Workers AI) | $0 (Workers AI) | +| **Cost** | Workers AI: $0.10/M input, $0.30/M output | Workers AI: $0.660/M input, $1.000/M output | Workers AI: $0.051/M input, $0.335/M output | ### 6. Workers AI Model Availability @@ -190,7 +190,7 @@ These workarounds should remain in the generic proxy for backward compatibility 1. **Use Gemma 4 26B as the default harness model.** It requires zero workarounds, produces reasoning traces, and has official function_calling support from Cloudflare. 
Qwen 2.5 Coder remains as a fallback but should not be the default. -2. **Persist the `reasoning` field in harness traces.** It provides free observability — the model's decision-making process is visible without needing "chain of thought" prompting or separate logging. +2. **Persist the `reasoning` field in harness traces.** It provides built-in observability — the model's decision-making process is visible without needing "chain of thought" prompting or separate logging. 3. **Next experiment: OpenAI model through Unified Billing.** Per the knowledge graph, the priority after Gemma is a small OpenAI model (gpt-4.1-mini) through the Unified API path. This requires `CF_AIG_TOKEN` with Unified Billing scope, which was blocked in the previous experiment. diff --git a/experiments/ai-gateway-tool-call/FINDINGS-openai.md b/experiments/ai-gateway-tool-call/FINDINGS-openai.md index fff2aeedb..7585cbc01 100644 --- a/experiments/ai-gateway-tool-call/FINDINGS-openai.md +++ b/experiments/ai-gateway-tool-call/FINDINGS-openai.md @@ -7,7 +7,7 @@ ## Summary -GPT-4.1 Mini (`gpt-4.1-mini`) is a **strong alternative** to Gemma 4 26B for SAM's harness tool-calling workloads. It passes all tests with zero workarounds, produces the most token-efficient responses, and has the fastest latency. However, it costs real money (not free-tier), does not produce reasoning traces, and routes through Cloudflare's Unified Billing (requires `cf-aig-authorization`). It is recommended as the **paid fallback** when higher quality or faster response times justify the cost over the free Gemma 4 26B default. +GPT-4.1 Mini (`gpt-4.1-mini`) is a **strong alternative** to Gemma 4 26B for SAM's harness tool-calling workloads. It passes all tests with zero workarounds, produces the most token-efficient responses, and has the fastest latency. However, it costs more than Gemma for this measured loop, does not produce reasoning traces, and routes through Cloudflare's Unified Billing (requires `cf-aig-authorization`). It is recommended as the **paid fallback** when higher quality, longer context, or faster response times justify the cost over the Cloudflare-billed Gemma 4 26B default. GPT-4.1 Nano (`gpt-4.1-nano`) exhibited tool-call quality issues (duplicate calls) and is **not recommended** for harness use without further evaluation. 
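
If `gpt-4.1-nano` is ever revisited, the harness would need a duplicate-call guard before executing tool calls. A minimal TypeScript sketch of such a guard (hypothetical helper — not part of the SAM proxy or harness code):

```ts
// Hypothetical guard: within one assistant turn, drop tool calls that repeat the
// same function name and arguments, so a duplicate like Nano's double
// get_weather({"city":"Paris"}) is executed only once.
interface ToolCall {
  id: string;
  type: 'function';
  function: { name: string; arguments: string };
}

function dedupeToolCalls(calls: ToolCall[]): { keep: ToolCall[]; dropped: ToolCall[] } {
  const seen = new Set<string>();
  const keep: ToolCall[] = [];
  const dropped: ToolCall[] = [];
  for (const call of calls) {
    const key = `${call.function.name}:${call.function.arguments}`;
    (seen.has(key) ? dropped : keep).push(call);
    seen.add(key);
  }
  return { keep, dropped };
}
```

Note that the OpenAI API still expects a `tool` message for every `tool_call_id` in the assistant turn, so dropped duplicates would need a stub result (or removal from the stored assistant message) rather than being silently ignored.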
@@ -74,7 +74,7 @@ Tested `gpt-4.1-nano` as a potential ultra-cheap option: | **Latency (2-tool task)** | **~2.6s** | ~4.0s | ~3.0s | | **Workarounds needed** | **None** | **None** | 2 | | **Context window** | 1M | 32K | 32K | -| **Cost** | Paid (Unified Billing) | $0 (Workers AI) | $0 (Workers AI) | +| **Cost** | OpenAI via Unified Billing | Workers AI: $0.10/M input, $0.30/M output | Workers AI: $0.660/M input, $1.000/M output | | **Observability** | usage_details only | reasoning field | None | | **Duplicate tool calls** | No | No | No | @@ -189,7 +189,7 @@ cf-aig-metadata: {"userId":"...","workspaceId":"...","projectId":"...","source": | Field | GPT-4.1 Mini | Gemma 4 26B | |-------|-------------|-------------| -| `message.reasoning` | Absent | Present (free CoT) | +| `message.reasoning` | Absent | Present (built-in reasoning field) | | `message.refusal` | Present (`null`) | Absent | | `message.annotations` | Present (`[]`) | Absent | | `usage.prompt_tokens_details` | Detailed breakdown | Minimal | @@ -202,32 +202,34 @@ cf-aig-metadata: {"userId":"...","workspaceId":"...","projectId":"...","source": Pricing via Cloudflare Unified Billing (as of 2026-05-05): -| Model | Input (per 1M tokens) | Output (per 1M tokens) | Cost for 2-tool loop (606 tokens) | +| Model | Input (per 1M tokens) | Output (per 1M tokens) | Cost for measured loop | |-------|----------------------|------------------------|-----------------------------------| -| GPT-4.1 Mini | ~$0.40 | ~$1.60 | ~$0.0001 | -| GPT-4.1 Nano | ~$0.10 | ~$0.40 | ~$0.00003 | -| Gemma 4 26B | $0 | $0 | **$0** (Workers AI free tier) | +| GPT-4.1 Mini | $0.40 | $1.60 | ~$0.00014 for 225 input / 30 output tokens | +| GPT-5 Mini | $0.25 | $2.00 | ~$0.00012 for the same 225 input / 30 output token mix (next eval target) | +| GPT-4.1 Nano | $0.10 | $0.40 | ~$0.00003 for the same 225 input / 30 output token mix | +| Gemma 4 26B | $0.10 | $0.30 | ~$0.00004 for 298 input / 34 output tokens | ## Implications for SAM Harness ### Model Selection Strategy ``` -Default (free, good quality): @cf/google/gemma-4-26b-a4b-it (Workers AI) -Paid fallback (fast, efficient): gpt-4.1-mini (OpenAI via Unified Billing) +Default (low-cost, good quality): @cf/google/gemma-4-26b-a4b-it (Workers AI) +Paid fallback (fast, efficient): gpt-4.1-mini (OpenAI via Unified Billing) +Next candidate: gpt-5-mini (OpenAI via Unified Billing) ``` ### When to Use GPT-4.1 Mini Over Gemma 4 1. **Latency-sensitive operations** — GPT-4.1 Mini is ~1.5x faster per turn 2. **Long context tasks** — 1M token window vs Gemma's 32K -3. **Token budget constraints** — uses ~1.9x fewer tokens per loop (fewer billable Workers AI neurons, even though GPT costs money) +3. **Token budget constraints** — uses ~1.9x fewer tokens per loop 4. **When the user has Unified Billing enabled** — the cost per task is negligible (~$0.0001 per tool-call loop) ### When to Prefer Gemma 4 -1. **Free tier / cost-zero requirement** — Gemma 4 is completely free on Workers AI -2. **Observability needs** — Gemma's `reasoning` field provides free chain-of-thought logging +1. **Lowest measured cost** — Gemma 4 is Cloudflare-billed, but this loop was still cheaper than GPT-4.1 Mini +2. **Observability needs** — Gemma's `reasoning` field provides built-in reasoning traces 3. **No Unified Billing configured** — the OpenAI path requires `CF_AIG_TOKEN` or `CF_API_TOKEN` with billing scope 4. 
**Self-hosted deployments** — Workers AI requires only a standard Cloudflare token, not OpenAI billing @@ -245,15 +247,15 @@ GPT-4.1 Mini could be added to the model registry with: ## Recommendations -1. **Keep Gemma 4 26B as default harness model.** Free, good quality, reasoning traces. The cost advantage is decisive for a platform that runs many agent loops. +1. **Keep Gemma 4 26B as default harness model.** Low-cost, good quality, reasoning traces. The cost advantage is meaningful for a platform that runs many agent loops. 2. **Add GPT-4.1 Mini as a configurable paid alternative.** When users or the platform want faster/more-efficient responses and are willing to pay, GPT-4.1 Mini is the clear choice. Add it to the model registry. 3. **Do NOT use GPT-4.1 Nano for harness work.** Duplicate tool calls indicate insufficient quality for autonomous agent loops. -4. **Unified Billing is the gate.** The OpenAI path requires `cf-aig-authorization` with a billing-capable token. Self-hosters without Unified Billing cannot use this path — they fall back to Gemma 4 (free) or provide their own OpenAI API key via platform credentials. +4. **Unified Billing is the gate.** The OpenAI path requires `cf-aig-authorization` with a billing-capable token. Self-hosters without Unified Billing cannot use this path — they fall back to Gemma 4 through Workers AI billing or provide their own OpenAI API key via platform credentials. 5. **Consider a model tier system in the harness config:** - - Tier 0 (free): `@cf/google/gemma-4-26b-a4b-it` — default, always available - - Tier 1 (cheap): `gpt-4.1-mini` — fast, efficient, requires Unified Billing + - Tier 0 (low-cost Workers AI): `@cf/google/gemma-4-26b-a4b-it` — default, no separate provider credentials + - Tier 1 (small OpenAI): `gpt-4.1-mini` or `gpt-5-mini` — fast, efficient, requires Unified Billing - Tier 2 (premium): `claude-haiku-4-5` — highest quality, most expensive (also passed the test) diff --git a/experiments/ai-gateway-tool-call/FINDINGS.md b/experiments/ai-gateway-tool-call/FINDINGS.md index 8d4b09590..c022f44a3 100644 --- a/experiments/ai-gateway-tool-call/FINDINGS.md +++ b/experiments/ai-gateway-tool-call/FINDINGS.md @@ -7,7 +7,7 @@ This experiment validates multi-model tool calling through Cloudflare AI Gateway for SAM's native harness. Three provider paths were tested: Anthropic (via Unified API), OpenAI (via Unified API), and Workers AI (via dedicated path). -**Key result**: Workers AI (Qwen 2.5 Coder 32B) successfully completed a two-tool loop (`get_weather` -> `calculate`) with structured `tool_calls` responses through AI Gateway, proving the concept is viable for SAM's cost-free tier. +**Key result**: Workers AI (Qwen 2.5 Coder 32B) successfully completed a two-tool loop (`get_weather` -> `calculate`) with structured `tool_calls` responses through AI Gateway, proving the concept is viable for SAM's low-cost Cloudflare-billed tier. ## Models Tested @@ -82,11 +82,11 @@ The existing metadata schema works unchanged for multi-model tool calling. Each | Path | Billing | Cost | |------|---------|------| -| Workers AI | Free (included in Workers plan) | $0 | +| Workers AI | Cloudflare Workers AI billing | Model-specific per-token rates | | Unified API (Anthropic) | Via `cf-aig-authorization` (Unified Billing) or direct API key | Standard Anthropic pricing | | Unified API (OpenAI) | Via `cf-aig-authorization` (Unified Billing) or direct API key | Standard OpenAI pricing | -Workers AI models are the only zero-cost path. 
For SAM's free tier, routing all tool-call work through Workers AI models avoids per-token costs entirely. +Workers AI models are Cloudflare-billed, not free. For SAM's lowest-cost tier, routing tool-call work through Workers AI keeps billing inside the Cloudflare account and avoids separate provider credentials. ## Model Registry Additions @@ -103,11 +103,11 @@ The `PLATFORM_AI_MODELS` registry in `packages/shared/src/constants/ai-services. ### New Model Added -`@cf/qwen/qwen2.5-coder-32b-instruct` — Workers AI model with 32K context, good tool-call support, and zero cost. Recommended as the primary free-tier model for tool-calling workloads. +`@cf/qwen/qwen2.5-coder-32b-instruct` — Workers AI model with 32K context and good tool-call support. Recommended as the primary low-cost Workers AI model for tool-calling workloads in this baseline, superseded by Gemma 4 in the 2026-05-05 follow-up. ## Recommendations for SAM Harness -1. **Use Workers AI path for free-tier tool calling**. The Unified API (`/compat/chat/completions`) does not support Workers AI models — they must use the dedicated `/workers-ai/v1/chat/completions` path. +1. **Use Workers AI path for low-cost Cloudflare-billed tool calling**. The Unified API (`/compat/chat/completions`) does not support Workers AI models — they must use the dedicated `/workers-ai/v1/chat/completions` path. 2. **Apply Workers AI workarounds in the proxy layer**, not in calling agents: - Normalize `content: null` → `""` in assistant messages before forwarding to Workers AI diff --git a/packages/shared/src/constants/ai-services.ts b/packages/shared/src/constants/ai-services.ts index 8317222dd..7fc211b6b 100644 --- a/packages/shared/src/constants/ai-services.ts +++ b/packages/shared/src/constants/ai-services.ts @@ -115,7 +115,7 @@ export const DEFAULT_TTS_RETRY_BASE_DELAY_MS = 500; // ============================================================================= /** Default model for AI proxy inference when no admin override is set. - * Out-of-box default is a free Workers AI model — no API key required. + * Out-of-box default is a Cloudflare-billed Workers AI model — no separate provider API key required. * Admins can override via the AI Proxy admin page (stored in KV) or * the AI_PROXY_DEFAULT_MODEL env var. */ export const DEFAULT_AI_PROXY_MODEL = '@cf/meta/llama-4-scout-17b-16e-instruct'; @@ -129,7 +129,7 @@ export const DEFAULT_AI_PROXY_ANTHROPIC_MODEL = 'claude-sonnet-4-6'; export const DEFAULT_AI_PROXY_OPENAI_MODEL = 'gpt-4.1'; /** Budget tier for platform AI models. */ -export type PlatformAIModelTier = 'free' | 'standard' | 'premium'; +export type PlatformAIModelTier = 'low-cost' | 'standard' | 'premium'; /** Tool-call reliability tier for agent loop suitability. */ export type ToolCallSupport = 'excellent' | 'good' | 'limited' | 'none'; @@ -150,7 +150,7 @@ export interface PlatformAIModel { isDefault?: boolean; /** Provider for the model (determines routing in AI proxy) */ provider: 'workers-ai' | 'anthropic' | 'openai'; - /** Budget tier: free (Workers AI free tier), standard, or premium */ + /** Budget tier: Cloudflare-billed low-cost, standard, or premium */ tier: PlatformAIModelTier; /** Approximate cost per 1K input tokens (USD) for budget estimation. Actual costs from AI Gateway logs. */ costPer1kInputTokens: number; @@ -180,22 +180,22 @@ export interface PlatformAIModel { /** Models available through the SAM Platform AI proxy. 
* This is the single source of truth — the DEFAULT_AI_PROXY_ALLOWED_MODELS * string and the UI dropdown both derive from this list. - * Includes Workers AI (free, Cloudflare-hosted), Anthropic, and OpenAI + * Includes Workers AI (Cloudflare-billed), Anthropic, and OpenAI * models routed through Cloudflare AI Gateway with Unified Billing. */ export const PLATFORM_AI_MODELS: PlatformAIModel[] = [ - // --- Workers AI (free tier) --- + // --- Workers AI (Cloudflare-billed low-cost tier) --- { id: '@cf/meta/llama-4-scout-17b-16e-instruct', label: 'Llama 4 Scout 17B', isDefault: true, provider: 'workers-ai', - tier: 'free', - costPer1kInputTokens: 0, - costPer1kOutputTokens: 0, + tier: 'low-cost', + costPer1kInputTokens: 0.00027, + costPer1kOutputTokens: 0.00085, contextWindow: 131072, toolCallSupport: 'limited', intendedRole: 'utility', - fallbackGroup: 'free-general', + fallbackGroup: 'workers-general', allowedScopes: ['workspace'], unifiedApiModelId: null, }, @@ -203,13 +203,13 @@ export const PLATFORM_AI_MODELS: PlatformAIModel[] = [ id: '@cf/qwen/qwen3-30b-a3b-fp8', label: 'Qwen 3 30B', provider: 'workers-ai', - tier: 'free', - costPer1kInputTokens: 0, - costPer1kOutputTokens: 0, + tier: 'low-cost', + costPer1kInputTokens: 0.000051, + costPer1kOutputTokens: 0.000335, contextWindow: 32768, toolCallSupport: 'good', intendedRole: 'workspace-agent', - fallbackGroup: 'free-coding', + fallbackGroup: 'workers-coding', allowedScopes: ['workspace'], unifiedApiModelId: null, }, @@ -217,13 +217,13 @@ export const PLATFORM_AI_MODELS: PlatformAIModel[] = [ id: '@cf/qwen/qwen2.5-coder-32b-instruct', label: 'Qwen 2.5 Coder 32B', provider: 'workers-ai', - tier: 'free', - costPer1kInputTokens: 0, - costPer1kOutputTokens: 0, + tier: 'low-cost', + costPer1kInputTokens: 0.00066, + costPer1kOutputTokens: 0.001, contextWindow: 32768, toolCallSupport: 'good', intendedRole: 'workspace-agent', - fallbackGroup: 'free-coding', + fallbackGroup: 'workers-coding', allowedScopes: ['workspace'], unifiedApiModelId: null, }, @@ -231,13 +231,13 @@ export const PLATFORM_AI_MODELS: PlatformAIModel[] = [ id: '@cf/google/gemma-4-26b-a4b-it', label: 'Gemma 4 26B', provider: 'workers-ai', - tier: 'free', - costPer1kInputTokens: 0, - costPer1kOutputTokens: 0, + tier: 'low-cost', + costPer1kInputTokens: 0.0001, + costPer1kOutputTokens: 0.0003, contextWindow: 32768, toolCallSupport: 'good', intendedRole: 'workspace-agent', - fallbackGroup: 'free-coding', + fallbackGroup: 'workers-coding', allowedScopes: ['workspace'], unifiedApiModelId: null, }, @@ -245,13 +245,13 @@ export const PLATFORM_AI_MODELS: PlatformAIModel[] = [ id: '@cf/google/gemma-3-12b-it', label: 'Gemma 3 12B', provider: 'workers-ai', - tier: 'free', - costPer1kInputTokens: 0, - costPer1kOutputTokens: 0, + tier: 'low-cost', + costPer1kInputTokens: 0.00035, + costPer1kOutputTokens: 0.00056, contextWindow: 32768, toolCallSupport: 'none', intendedRole: 'utility', - fallbackGroup: 'free-utility', + fallbackGroup: 'workers-utility', allowedScopes: ['workspace'], unifiedApiModelId: null, }, @@ -261,8 +261,8 @@ export const PLATFORM_AI_MODELS: PlatformAIModel[] = [ label: 'Claude Haiku 4.5', provider: 'anthropic', tier: 'standard', - costPer1kInputTokens: 0.0008, - costPer1kOutputTokens: 0.004, + costPer1kInputTokens: 0.001, + costPer1kOutputTokens: 0.005, contextWindow: 200000, toolCallSupport: 'excellent', intendedRole: 'utility', @@ -289,8 +289,8 @@ export const PLATFORM_AI_MODELS: PlatformAIModel[] = [ label: 'Claude Opus 4.6', provider: 'anthropic', tier: 'premium', - 
costPer1kInputTokens: 0.015, - costPer1kOutputTokens: 0.075, + costPer1kInputTokens: 0.005, + costPer1kOutputTokens: 0.025, contextWindow: 200000, toolCallSupport: 'excellent', intendedRole: 'sam-agent', diff --git a/packages/shared/tests/unit/ai-model-registry.test.ts b/packages/shared/tests/unit/ai-model-registry.test.ts index 53b8baa13..375b47c52 100644 --- a/packages/shared/tests/unit/ai-model-registry.test.ts +++ b/packages/shared/tests/unit/ai-model-registry.test.ts @@ -48,20 +48,22 @@ describe('AI Model Registry', () => { } }); - it('free-tier models have zero cost', () => { - const freeModels = PLATFORM_AI_MODELS.filter((m) => m.tier === 'free'); - for (const model of freeModels) { - expect(model.costPer1kInputTokens, `Free model ${model.id} should have zero input cost`).toBe(0); - expect(model.costPer1kOutputTokens, `Free model ${model.id} should have zero output cost`).toBe(0); + it('low-cost models are Workers AI models with Cloudflare billing metadata', () => { + const lowCostModels = PLATFORM_AI_MODELS.filter((m) => m.tier === 'low-cost'); + expect(lowCostModels.length).toBeGreaterThan(0); + + for (const model of lowCostModels) { + expect(model.provider, `Low-cost model ${model.id} should route through Workers AI`).toBe('workers-ai'); + expect(model.costPer1kInputTokens, `Low-cost model ${model.id} should have input cost metadata`).toBeGreaterThan(0); + expect(model.costPer1kOutputTokens, `Low-cost model ${model.id} should have output cost metadata`).toBeGreaterThan(0); } }); - it('paid-tier models have non-zero cost', () => { - const paidModels = PLATFORM_AI_MODELS.filter((m) => m.tier !== 'free'); - for (const model of paidModels) { + it('all catalog models have non-zero cost metadata', () => { + for (const model of PLATFORM_AI_MODELS) { expect( model.costPer1kInputTokens > 0 || model.costPer1kOutputTokens > 0, - `Paid model ${model.id} should have non-zero cost`, + `Model ${model.id} should have non-zero cost metadata`, ).toBe(true); } }); @@ -255,9 +257,9 @@ describe('AI Model Registry', () => { id: 'model-good', label: 'Good Model', provider: 'workers-ai', - tier: 'free', - costPer1kInputTokens: 0, - costPer1kOutputTokens: 0, + tier: 'low-cost', + costPer1kInputTokens: 0.0001, + costPer1kOutputTokens: 0.0003, contextWindow: 32768, toolCallSupport: 'good', intendedRole: 'workspace-agent', @@ -269,9 +271,9 @@ describe('AI Model Registry', () => { id: 'model-limited', label: 'Limited Model', provider: 'workers-ai', - tier: 'free', - costPer1kInputTokens: 0, - costPer1kOutputTokens: 0, + tier: 'low-cost', + costPer1kInputTokens: 0.0001, + costPer1kOutputTokens: 0.0003, contextWindow: 131072, toolCallSupport: 'limited', intendedRole: 'utility', @@ -283,9 +285,9 @@ describe('AI Model Registry', () => { id: 'model-none', label: 'No Tool Model', provider: 'workers-ai', - tier: 'free', - costPer1kInputTokens: 0, - costPer1kOutputTokens: 0, + tier: 'low-cost', + costPer1kInputTokens: 0.0001, + costPer1kOutputTokens: 0.0003, contextWindow: 8192, toolCallSupport: 'none', intendedRole: 'utility', From 4c49b81a6f0d380f80ce6bb2f4a27f3232c9fd84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Titsworth-Morin?= Date: Wed, 6 May 2026 02:20:06 +0000 Subject: [PATCH 3/3] chore: refresh PR evidence checks