From 02ef950aebc725cd4a563f2e82e3ef1c5f46e5b1 Mon Sep 17 00:00:00 2001 From: hshum Date: Tue, 2 Jun 2026 00:32:15 -0700 Subject: [PATCH] feat(llm): route search.ts classify/extract/synthesize through the LLM router Track B of the LLM-router rollout. The /search route hardcoded gemini-3.1-flash-lite-preview at 7 Gemini call sites. Wire each through the shared planner-on-a-pool router (shared/llm/router.ts) so model choice is owned in one place and long / analytical / multi-entity turns can escalate. ADDITIVE + behavior-preserving: the floor of every classify/extract/synthesize pool is the same flash-lite model, and signals are derived cheaply + locally (query length, retrieved source count, multiEntity for comparison branches), so a simple single-entity query routes to the exact same model as before. Call sites wired (server/routes/search.ts): - classifyQueryWithLLM (query classification) -> routeLLM classify [single-candidate pool, guaranteed no-op] - agent_synthesize trace (synthesizeResults) -> routeLLM synthesize [surfaces chosen model + reason in trace; wire-level call lives in agentHarness.ts] - why-this-team credibility enrichment -> routeLLM extract - multi-entity comparison extraction -> routeLLM extract (multiEntity true) - single-entity extraction -> routeLLM extract - founder-direction extraction -> routeLLM extract Observability: the chosen model lands in each trace step tool field and the route reason is appended to the step detail, matching the existing SearchTraceEntry shape exactly. Reliability (.claude/rules/agentic_reliability.md): searchRouteSignals is a pure function (no Date/random) so routing is DETERMINISTIC + replay-safe; NaN/negative source counts are coerced to 0. The AbortController/Promise.race budget gates and the grounding pipeline are untouched. 
Tests: server/searchRouteLlmRouting.test.ts -- scenario-based (founder lookup, investor comparison, banker diligence), asserting the no-op floor for simple queries, escalation for hard turns, classify-never-escalates, and determinism. Verification: tsc --noEmit clean; vitest 21 routing + 25 existing search-route tests pass; npm run build clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- server/routes/search.ts | 113 ++++++++++++++++++--- server/searchRouteLlmRouting.test.ts | 140 +++++++++++++++++++++++++++ 2 files changed, 240 insertions(+), 13 deletions(-) create mode 100644 server/searchRouteLlmRouting.test.ts diff --git a/server/routes/search.ts b/server/routes/search.ts index 35753bcae..0d0f00133 100644 --- a/server/routes/search.ts +++ b/server/routes/search.ts @@ -66,8 +66,52 @@ import { shouldStreamAnswer, type RetrievalState, } from "../../convex/domains/agents/safety/lowConfidenceGuard.js"; +import { routeLLM, type RouteSignals, type TaskClass } from "../../shared/llm/router.js"; const SEARCH_SOURCE = "search_api"; + +// ── LLM model routing for the search pipeline ──────────────────────────────── +// Every Gemini call in this route used to hardcode `gemini-3.1-flash-lite-preview`. +// We now derive cheap, deterministic signals from local context and ask the shared +// router which model to use. The FLOOR of the classify/extract/synthesize pools is +// that same flash-lite model, so for simple queries this is a behavior-preserving +// no-op — only long, multi-entity, or analytical turns escalate to a heavier model. +// See shared/llm/router.ts + docs/architecture/LLM_ROUTER.md. +const SEARCH_ANALYTICAL_RE = + /\b(compare|comparison|versus|vs\.?|trade-?offs?|why|how should|strateg(y|ic)|implications?|pros and cons|risks?|diligence|teardown)\b/i; + +/** + * Derive deterministic routing signals from the raw query + the number of + * retrieved sources we are about to synthesize/extract over. 
Pure function — no + * Date/random — so routing is replay-safe (DETERMINISTIC, agentic_reliability.md). + */ +export function searchRouteSignals( + query: string, + sourceCount: number, + opts: { multiEntity?: boolean } = {}, +): RouteSignals { + const q = (query || "").trim(); + const analytical = SEARCH_ANALYTICAL_RE.test(q); + return { + inputChars: q.length, + sourceCount: Number.isFinite(sourceCount) && sourceCount > 0 ? sourceCount : 0, + multiEntity: opts.multiEntity ?? false, + complexityHint: analytical ? "high" : q.length > 240 ? "medium" : "low", + }; +} + +/** + * Route a search-pipeline LLM call and return the chosen model id plus a short, + * trace-friendly reason string (`"{model} — {reason}"`). The reason is surfaced in + * the search trace's `detail` so escalations are observable. + */ +function routeSearchModel( + taskClass: TaskClass, + signals: RouteSignals, +): { model: string; detail: string } { + const decision = routeLLM(taskClass, signals); + return { model: decision.model, detail: `${decision.model} — ${decision.reason}` }; +} const CONTROL_PLANE_VIEW_ID = "view:control-plane"; const LENS_PERSONA_MAP: Record = { founder: "FOUNDER_STRATEGY", @@ -2389,8 +2433,14 @@ export function createSearchRouter(tools: McpTool[]) { const fullPrompt = sessionContext ? `${sessionContext}\n\nNow classify this query:\n${query}` : query; + // classify pool is single-candidate (flash-lite) — routing is a deterministic + // no-op here, but keeps the model id owned by the shared router. 
+ const { model: classifyModel } = routeSearchModel( + "classify", + searchRouteSignals(query, 0), + ); const resp = await fetch( - `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${apiKey}`, + `https://generativelanguage.googleapis.com/v1beta/models/${classifyModel}:generateContent?key=${apiKey}`, { method: "POST", headers: { "Content-Type": "application/json" }, @@ -2920,7 +2970,17 @@ Entity extraction rules: // Synthesize results into a structured packet checkBudget(); - const synthTrace = traceStep("agent_synthesize", "gemini-3.1-flash-lite"); + // Route the synthesize model from query complexity + the number of tool + // results we are folding into the answer. Floor is flash-lite (current + // behavior); long/analytical/many-source turns escalate. The model is + // surfaced in the trace below for observability. (The wire-level model + // for synthesizeResults itself lives in server/agentHarness.ts; this + // routes + labels the search-side decision.) + const synthRoute = routeSearchModel( + "synthesize", + searchRouteSignals(query, execution.stepResults.length), + ); + const synthTrace = traceStep("agent_synthesize", synthRoute.model); const synthesized = await Promise.race([ synthesizeResults( execution, @@ -2936,7 +2996,7 @@ Entity extraction rules: else setTimeout(() => reject(new Error("Request budget exceeded")), remaining); }), ]); - synthTrace.ok(`${synthesized.confidence}% confidence`); + synthTrace.ok(`${synthesized.confidence}% confidence · ${synthRoute.detail}`); // ── Parallel enrichment: Monte Carlo + Why This Team credibility ── // Both run concurrently after synthesis to stay within Vercel timeout. @@ -2992,8 +3052,15 @@ Entity extraction rules: } catch { /* local context is best-effort */ } } + // Credibility enrichment is a structured extraction over the + // synthesized result + local context — route as "extract" (floor + // flash-lite; sourceCount is passed as 0, so only the query text can escalate). 
+ const { model: credModel } = routeSearchModel( + "extract", + searchRouteSignals(query, 0), + ); const credResp = await fetch( - `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${process.env.GEMINI_API_KEY}`, + `https://generativelanguage.googleapis.com/v1beta/models/${credModel}:generateContent?key=${process.env.GEMINI_API_KEY}`, { method: "POST", headers: { "Content-Type": "application/json" }, @@ -3361,11 +3428,21 @@ Entity extraction rules: // Use Gemini to produce a comparative analysis let comparison: any = null; if (process.env.GEMINI_API_KEY) { - const extractTrace = traceStep("llm_extract", "gemini-3.1-flash-lite-preview"); + // Multi-entity comparison — inherently multiEntity, so this branch is + // the most likely to escalate above the flash-lite floor. + const extractRoute = routeSearchModel( + "extract", + searchRouteSignals( + query, + entityResults.reduce((s, e) => s + (e.resultCount ?? 0), 0), + { multiEntity: true }, + ), + ); + const extractTrace = traceStep("llm_extract", extractRoute.model); try { const entityContext = entityResults.map(e => `## ${e.name}\n${e.answer ? e.answer.slice(0, 400) + "\n" : ""}${e.snippets.slice(0, 2).join("\n")}`).join("\n\n"); const geminiResp = await fetch( - `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${process.env.GEMINI_API_KEY}`, + `https://generativelanguage.googleapis.com/v1beta/models/${extractRoute.model}:generateContent?key=${process.env.GEMINI_API_KEY}`, { method: "POST", headers: { "Content-Type": "application/json" }, @@ -3397,7 +3474,7 @@ Return ONLY valid JSON: if (jsonMatch) comparison = JSON.parse(jsonMatch[0].replace(/,\s*([\]}])/g, "$1")); } } - extractTrace.ok(`extracted ${comparison ? "ok" : "empty"}`); + extractTrace.ok(`extracted ${comparison ? 
"ok" : "empty"} · ${extractRoute.detail}`); } catch { extractTrace.error("gemini comparison failed"); } } @@ -3513,10 +3590,15 @@ Return ONLY valid JSON: let geminiExtracted: any = null; const hasSearchData = linkupAnswer.length > 20 || allSnippets.length > 0; if (hasSearchData && process.env.GEMINI_API_KEY) { - const extractTrace = traceStep("llm_extract", "gemini-3.1-flash-lite-preview"); + // Single-entity structured extraction over the gathered snippets. + const extractRoute = routeSearchModel( + "extract", + searchRouteSignals(query, allSnippets.length), + ); + const extractTrace = traceStep("llm_extract", extractRoute.model); try { const geminiResp = await fetch( - `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${process.env.GEMINI_API_KEY}`, + `https://generativelanguage.googleapis.com/v1beta/models/${extractRoute.model}:generateContent?key=${process.env.GEMINI_API_KEY}`, { method: "POST", headers: { "Content-Type": "application/json" }, @@ -3566,7 +3648,7 @@ Return ONLY valid JSON: } } } - extractTrace.ok(`extracted ${geminiExtracted ? "ok" : "empty"}`); + extractTrace.ok(`extracted ${geminiExtracted ? "ok" : "empty"} · ${extractRoute.detail}`); } catch { extractTrace.error("gemini extraction failed"); } } @@ -3736,10 +3818,15 @@ Return ONLY valid JSON: // If we have web data, use Gemini to extract structured analysis let genGemini: any = null; if (genWebSnippets.length >= 2 && process.env.GEMINI_API_KEY) { - const ext = traceStep("llm_extract", "gemini-3.1-flash-lite-preview"); + // Founder-direction extraction over gathered web snippets. 
+ const extRoute = routeSearchModel( + "extract", + searchRouteSignals(query, genWebSnippets.length), + ); + const ext = traceStep("llm_extract", extRoute.model); try { const resp = await fetch( - `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${process.env.GEMINI_API_KEY}`, + `https://generativelanguage.googleapis.com/v1beta/models/${extRoute.model}:generateContent?key=${process.env.GEMINI_API_KEY}`, { method: "POST", headers: { "Content-Type": "application/json" }, @@ -3773,7 +3860,7 @@ RULES: Only include facts grounded in the web data. If data is thin, return fewe if (m) genGemini = JSON.parse(m[0].replace(/,\s*([\]}])/g, "$1")); } } - ext.ok(genGemini ? "ok" : "empty"); + ext.ok(`${genGemini ? "ok" : "empty"} · ${extRoute.detail}`); } catch { ext.error("extraction failed"); } } diff --git a/server/searchRouteLlmRouting.test.ts b/server/searchRouteLlmRouting.test.ts new file mode 100644 index 000000000..19a21b5a9 --- /dev/null +++ b/server/searchRouteLlmRouting.test.ts @@ -0,0 +1,140 @@ +/** + * Scenario-based tests for the search route's LLM-router wiring + * (server/routes/search.ts `searchRouteSignals` + the shared router). + * + * Track B of the LLM-router rollout routes the search pipeline's Gemini calls + * (classify / extract / synthesize) through shared/llm/router.ts instead of a + * hardcoded `gemini-3.1-flash-lite-preview`. The whole point is that this is + * BEHAVIOR-PRESERVING for the common case: the floor of every pool is that same + * flash-lite model, so a short, single-entity, non-analytical query MUST still + * resolve to flash-lite. Only long / analytical / many-source / multi-entity + * turns may escalate. + * + * Per .claude/rules/scenario_testing.md each test starts from a real persona + + * goal. The risks we are guarding against: + * - Regression that ESCALATES the simple case (cost blowup + a non-no-op rollout). 
+ * - Regression that FAILS to escalate the genuinely hard comparison case. + * - Non-determinism (same query routing differently across calls — breaks replay). + * + * Per .claude/rules/agentic_reliability.md DETERMINISTIC: routeLLM + signal + * derivation are pure, so identical inputs must always yield the identical model. + */ +import { describe, expect, it } from "vitest"; + +import { searchRouteSignals } from "./routes/search.js"; +import { routeLLM } from "../shared/llm/router.js"; + +const FLASH_LITE = "gemini-3.1-flash-lite-preview"; // the production floor before this change +const FLASH = "gemini-3-flash-preview"; // the escalation target for extract/synthesize + +describe("search route LLM routing — behavior-preserving floor (the no-op guarantee)", () => { + /** + * Persona: a founder types a bare company name to pull a quick entity card. + * Goal: fast, cheap single-entity lookup. + * Prior state: a normal single-entity search with a handful of snippets. + * Expected: extract stays on the flash-lite floor — IDENTICAL to pre-router + * behavior. If this ever escalates, the "additive no-op" promise is broken. + */ + it("keeps a short single-entity extract on the flash-lite floor", () => { + const sig = searchRouteSignals("Mercury", 3); + const r = routeLLM("extract", sig); + expect(r.model).toBe(FLASH_LITE); + expect(r.tier).toBe("light"); + expect(r.escalated).toBe(false); + }); + + /** + * Persona: same founder, slightly longer plain-language query, few sources. + * Expected: still the floor — length alone (under the medium threshold) and a + * non-analytical phrasing must not trip escalation. 
+ */ + it("keeps a medium plain-language single-entity query on the floor", () => { + const sig = searchRouteSignals("Tell me about the company Linear and what they do", 4); + const r = routeLLM("extract", sig); + expect(r.model).toBe(FLASH_LITE); + expect(r.escalated).toBe(false); + }); + + /** + * classify is a single-candidate pool — it can NEVER escalate, no matter how + * complex the query looks. This is the strongest no-op guarantee in the route. + */ + it("classify never escalates regardless of query complexity", () => { + const heavy = searchRouteSignals( + "Compare Anthropic versus OpenAI on tool use, pricing, and the strategic risks for a multi-tenant deployment", + 12, + { multiEntity: true }, + ); + const r = routeLLM("classify", heavy); + expect(r.model).toBe(FLASH_LITE); + expect(r.tier).toBe("light"); + expect(r.escalated).toBe(false); + }); +}); + +describe("search route LLM routing — escalation on genuinely hard turns", () => { + /** + * Persona: an investor runs a head-to-head comparison ("X vs Y") across many + * gathered sources. Goal: a well-reasoned comparative synthesis. + * Prior state: the multi-entity branch gathered results for 2+ entities. + * Expected: extract escalates above the flash-lite floor — multiEntity plus an + * analytical verb plus many sources together cross the heavy threshold. + */ + it("escalates a multi-entity comparison with many sources", () => { + const sig = searchRouteSignals( + "Compare Stripe versus Adyen for cross-border payments — tradeoffs and risks", + 14, + { multiEntity: true }, + ); + const r = routeLLM("extract", sig); + expect(r.escalated).toBe(true); + expect(r.model).toBe(FLASH); + expect(r.tier).not.toBe("light"); + }); + + /** + * Persona: a banker asks a long analytical diligence question over a deep + * source set. Expected: synthesize escalates — "diligence" + length + sources. 
+ */ + it("escalates a long analytical diligence synthesis", () => { + const q = + "Give me a full diligence teardown of why this company's moat is defensible, what the strategic risks are, and how the funding trajectory implies their runway under a downturn scenario at scale."; + const r = routeLLM("synthesize", searchRouteSignals(q, 9)); + expect(r.escalated).toBe(true); + expect(r.model).toBe(FLASH); + }); + + /** + * Boundary: a short but explicitly multi-entity comparison with only a couple + * of sources. multiEntity (0.3) + analytical "compare" hint (0.5) already clears + * the escalate threshold (0.5) even without source weight, so it should escalate. + */ + it("escalates a short multi-entity comparison even with few sources", () => { + const sig = searchRouteSignals("compare Brave and Serper", 2, { multiEntity: true }); + const r = routeLLM("extract", sig); + expect(r.escalated).toBe(true); + }); +}); + +describe("search route LLM routing — determinism (replay safety)", () => { + it("derives identical signals + route for identical inputs", () => { + const a = searchRouteSignals("How should we evaluate voice agents for latency?", 5); + const b = searchRouteSignals("How should we evaluate voice agents for latency?", 5); + expect(a).toEqual(b); + expect(routeLLM("extract", a)).toEqual(routeLLM("extract", b)); + }); + + it("coerces missing / negative source counts to a safe 0 (no NaN leaks into scoring)", () => { + expect(searchRouteSignals("x", Number.NaN as unknown as number).sourceCount).toBe(0); + expect(searchRouteSignals("x", -5).sourceCount).toBe(0); + // and the route is still deterministic + on the floor for this trivial input + const r = routeLLM("extract", searchRouteSignals("x", Number.NaN as unknown as number)); + expect(r.model).toBe(FLASH_LITE); + expect(r.escalated).toBe(false); + }); + + it("flags analytical intent as high complexity, plain lookups as low", () => { + expect(searchRouteSignals("Why did valuations compress this quarter?", 
3).complexityHint).toBe("high"); + expect(searchRouteSignals("Acme", 1).complexityHint).toBe("low"); + }); +});