From 02ef950aebc725cd4a563f2e82e3ef1c5f46e5b1 Mon Sep 17 00:00:00 2001
From: hshum <hshum@users.noreply.github.com>
Date: Tue, 2 Jun 2026 00:32:15 -0700
Subject: [PATCH] feat(llm): route search.ts classify/extract/synthesize
 through the LLM router

Track B of the LLM-router rollout. The /search route hardcoded
gemini-3.1-flash-lite-preview at 6 call sites (5 Gemini fetches plus the
agent_synthesize trace label). Wire each through the shared planner-on-a-pool
router (shared/llm/router.ts) so model choice is owned in one place and long /
analytical / multi-entity turns can escalate.

ADDITIVE + behavior-preserving: the floor of every classify/extract/synthesize
pool is the same flash-lite model, and signals are derived cheaply + locally
(query length, retrieved source count, multiEntity for comparison branches), so
a simple single-entity query routes to the exact same model as before.

Call sites wired (server/routes/search.ts):
- classifyQueryWithLLM (query classification)  -> routeLLM classify [single-candidate pool, guaranteed no-op]
- agent_synthesize trace (synthesizeResults)   -> routeLLM synthesize [surfaces chosen model + reason in trace; wire-level call lives in agentHarness.ts]
- why-this-team credibility enrichment         -> routeLLM extract
- multi-entity comparison extraction           -> routeLLM extract (multiEntity true)
- single-entity extraction                     -> routeLLM extract
- founder-direction extraction                 -> routeLLM extract

Observability: the chosen model lands in each trace step tool field and the
route reason is appended to the step detail, matching the existing
SearchTraceEntry shape exactly.

Reliability (.claude/rules/agentic_reliability.md): searchRouteSignals is a pure
function (no Date/random) so routing is DETERMINISTIC + replay-safe; NaN/negative
source counts are coerced to 0. The AbortController/Promise.race budget gates and
the grounding pipeline are untouched.

Tests: server/searchRouteLlmRouting.test.ts -- scenario-based (founder lookup,
investor comparison, banker diligence), asserting the no-op floor for simple
queries, escalation for hard turns, classify-never-escalates, and determinism.

Verification: tsc --noEmit clean; vitest 21 routing + 25 existing search-route
tests pass; npm run build clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 server/routes/search.ts              | 113 ++++++++++++++++++---
 server/searchRouteLlmRouting.test.ts | 140 +++++++++++++++++++++++++++
 2 files changed, 240 insertions(+), 13 deletions(-)
 create mode 100644 server/searchRouteLlmRouting.test.ts

diff --git a/server/routes/search.ts b/server/routes/search.ts
index 35753bcae..0d0f00133 100644
--- a/server/routes/search.ts
+++ b/server/routes/search.ts
@@ -66,8 +66,52 @@ import {
   shouldStreamAnswer,
   type RetrievalState,
 } from "../../convex/domains/agents/safety/lowConfidenceGuard.js";
+import { routeLLM, type RouteSignals, type TaskClass } from "../../shared/llm/router.js";
 
 const SEARCH_SOURCE = "search_api";
+
+// ── LLM model routing for the search pipeline ────────────────────────────────
+// Every Gemini call in this route used to hardcode `gemini-3.1-flash-lite-preview`.
+// We now derive cheap, deterministic signals from local context and ask the shared
+// router which model to use. The FLOOR of the classify/extract/synthesize pools is
+// that same flash-lite model, so for simple queries this is a behavior-preserving
+// no-op — only long, multi-entity, or analytical turns escalate to a heavier model.
+// See shared/llm/router.ts + docs/architecture/LLM_ROUTER.md.
+const SEARCH_ANALYTICAL_RE =
+  /\b(compare|comparison|versus|vs\.?|trade-?offs?|why|how should|strateg(y|ic)|implications?|pros and cons|risks?|diligence|teardown)\b/i; // word-bounded + case-insensitive; no g/y flag, so .test() carries no lastIndex state between calls
+
+/**
+ * Derive deterministic routing signals from the raw query + the number of
+ * retrieved sources we are about to synthesize/extract over. Pure function — no
+ * Date/random — so routing is replay-safe (DETERMINISTIC, agentic_reliability.md).
+ */
+export function searchRouteSignals(
+  query: string,
+  sourceCount: number,
+  opts: { multiEntity?: boolean } = {},
+): RouteSignals {
+  const q = (query || "").trim(); // a falsy query falls back to "" before trimming
+  const analytical = SEARCH_ANALYTICAL_RE.test(q); // keyword match anywhere in the trimmed query
+  return {
+    inputChars: q.length, // length of the trimmed query, not the raw input
+    sourceCount: Number.isFinite(sourceCount) && sourceCount > 0 ? sourceCount : 0, // NaN, ±Infinity, 0 and negatives all become 0
+    multiEntity: opts.multiEntity ?? false, // false unless the caller sets it
+    complexityHint: analytical ? "high" : q.length > 240 ? "medium" : "low", // keyword match wins; otherwise more than 240 trimmed chars gives "medium"
+  };
+}
+
+/**
+ * Route a search-pipeline LLM call and return the chosen model id plus a short,
+ * trace-friendly reason string (`"<model> — <reason>"`). The reason is surfaced in
+ * the search trace's `detail` so escalations are observable.
+ */
+function routeSearchModel(
+  taskClass: TaskClass,
+  signals: RouteSignals,
+): { model: string; detail: string } {
+  const decision = routeLLM(taskClass, signals); // the shared router makes the choice; nothing is decided locally
+  return { model: decision.model, detail: `${decision.model} — ${decision.reason}` }; // detail repeats the model id so a trace line is readable on its own
+}
 const CONTROL_PLANE_VIEW_ID = "view:control-plane";
 const LENS_PERSONA_MAP: Record<string, string> = {
   founder: "FOUNDER_STRATEGY",
@@ -2389,8 +2433,14 @@ export function createSearchRouter(tools: McpTool[]) {
       const fullPrompt = sessionContext
         ? `${sessionContext}\n\nNow classify this query:\n${query}`
         : query;
+      // classify pool is single-candidate (flash-lite) — routing is a deterministic
+      // no-op here, but keeps the model id owned by the shared router.
+      const { model: classifyModel } = routeSearchModel(
+        "classify",
+        searchRouteSignals(query, 0),
+      );
       const resp = await fetch(
-        `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${apiKey}`,
+        `https://generativelanguage.googleapis.com/v1beta/models/${classifyModel}:generateContent?key=${apiKey}`,
         {
           method: "POST",
           headers: { "Content-Type": "application/json" },
@@ -2920,7 +2970,17 @@ Entity extraction rules:
 
           // Synthesize results into a structured packet
           checkBudget();
-          const synthTrace = traceStep("agent_synthesize", "gemini-3.1-flash-lite");
+          // Route the synthesize model from query complexity + the number of tool
+          // results we are folding into the answer. Floor is flash-lite (current
+          // behavior); long/analytical/many-source turns escalate. NOTE: this only
+          // routes + labels the search-side decision in the trace. The wire-level
+          // model for synthesizeResults itself lives in server/agentHarness.ts, so
+          // on escalation the trace label may not match the model that actually ran.
+          const synthRoute = routeSearchModel(
+            "synthesize",
+            searchRouteSignals(query, execution.stepResults.length),
+          );
+          const synthTrace = traceStep("agent_synthesize", synthRoute.model);
           const synthesized = await Promise.race([
             synthesizeResults(
               execution,
@@ -2936,7 +2996,7 @@ Entity extraction rules:
               else setTimeout(() => reject(new Error("Request budget exceeded")), remaining);
             }),
           ]);
-          synthTrace.ok(`${synthesized.confidence}% confidence`);
+          synthTrace.ok(`${synthesized.confidence}% confidence · ${synthRoute.detail}`);
 
           // ── Parallel enrichment: Monte Carlo + Why This Team credibility ──
           // Both run concurrently after synthesis to stay within Vercel timeout.
@@ -2992,8 +3052,15 @@ Entity extraction rules:
                   } catch { /* local context is best-effort */ }
                 }
 
+                // Credibility enrichment is a structured extraction over the
+                // synthesized result + local context — route as "extract" (floor
+                // flash-lite). Signals are query-only here: sourceCount is passed as 0.
+                const { model: credModel } = routeSearchModel(
+                  "extract",
+                  searchRouteSignals(query, 0),
+                );
                 const credResp = await fetch(
-                  `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${process.env.GEMINI_API_KEY}`,
+                  `https://generativelanguage.googleapis.com/v1beta/models/${credModel}:generateContent?key=${process.env.GEMINI_API_KEY}`,
                   {
                     method: "POST",
                     headers: { "Content-Type": "application/json" },
@@ -3361,11 +3428,21 @@ Entity extraction rules:
           // Use Gemini to produce a comparative analysis
           let comparison: any = null;
           if (process.env.GEMINI_API_KEY) {
-            const extractTrace = traceStep("llm_extract", "gemini-3.1-flash-lite-preview");
+            // Multi-entity comparison — inherently multiEntity, so this branch is
+            // the most likely to escalate above the flash-lite floor.
+            const extractRoute = routeSearchModel(
+              "extract",
+              searchRouteSignals(
+                query,
+                entityResults.reduce((s, e) => s + (e.resultCount ?? 0), 0),
+                { multiEntity: true },
+              ),
+            );
+            const extractTrace = traceStep("llm_extract", extractRoute.model);
             try {
               const entityContext = entityResults.map(e => `## ${e.name}\n${e.answer ? e.answer.slice(0, 400) + "\n" : ""}${e.snippets.slice(0, 2).join("\n")}`).join("\n\n");
               const geminiResp = await fetch(
-                `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${process.env.GEMINI_API_KEY}`,
+                `https://generativelanguage.googleapis.com/v1beta/models/${extractRoute.model}:generateContent?key=${process.env.GEMINI_API_KEY}`,
                 {
                   method: "POST",
                   headers: { "Content-Type": "application/json" },
@@ -3397,7 +3474,7 @@ Return ONLY valid JSON:
                   if (jsonMatch) comparison = JSON.parse(jsonMatch[0].replace(/,\s*([\]}])/g, "$1"));
                 }
               }
-              extractTrace.ok(`extracted ${comparison ? "ok" : "empty"}`);
+              extractTrace.ok(`extracted ${comparison ? "ok" : "empty"} · ${extractRoute.detail}`);
             } catch { extractTrace.error("gemini comparison failed"); }
           }
 
@@ -3513,10 +3590,15 @@ Return ONLY valid JSON:
           let geminiExtracted: any = null;
           const hasSearchData = linkupAnswer.length > 20 || allSnippets.length > 0;
           if (hasSearchData && process.env.GEMINI_API_KEY) {
-            const extractTrace = traceStep("llm_extract", "gemini-3.1-flash-lite-preview");
+            // Single-entity structured extraction over the gathered snippets.
+            const extractRoute = routeSearchModel(
+              "extract",
+              searchRouteSignals(query, allSnippets.length),
+            );
+            const extractTrace = traceStep("llm_extract", extractRoute.model);
             try {
               const geminiResp = await fetch(
-                `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${process.env.GEMINI_API_KEY}`,
+                `https://generativelanguage.googleapis.com/v1beta/models/${extractRoute.model}:generateContent?key=${process.env.GEMINI_API_KEY}`,
                 {
                   method: "POST",
                   headers: { "Content-Type": "application/json" },
@@ -3566,7 +3648,7 @@ Return ONLY valid JSON:
                   }
                 }
               }
-              extractTrace.ok(`extracted ${geminiExtracted ? "ok" : "empty"}`);
+              extractTrace.ok(`extracted ${geminiExtracted ? "ok" : "empty"} · ${extractRoute.detail}`);
             } catch { extractTrace.error("gemini extraction failed"); }
           }
 
@@ -3736,10 +3818,15 @@ Return ONLY valid JSON:
           // If we have web data, use Gemini to extract structured analysis
           let genGemini: any = null;
           if (genWebSnippets.length >= 2 && process.env.GEMINI_API_KEY) {
-            const ext = traceStep("llm_extract", "gemini-3.1-flash-lite-preview");
+            // Founder-direction extraction over gathered web snippets.
+            const extRoute = routeSearchModel(
+              "extract",
+              searchRouteSignals(query, genWebSnippets.length),
+            );
+            const ext = traceStep("llm_extract", extRoute.model);
             try {
               const resp = await fetch(
-                `https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-lite-preview:generateContent?key=${process.env.GEMINI_API_KEY}`,
+                `https://generativelanguage.googleapis.com/v1beta/models/${extRoute.model}:generateContent?key=${process.env.GEMINI_API_KEY}`,
                 {
                   method: "POST",
                   headers: { "Content-Type": "application/json" },
@@ -3773,7 +3860,7 @@ RULES: Only include facts grounded in the web data. If data is thin, return fewe
                   if (m) genGemini = JSON.parse(m[0].replace(/,\s*([\]}])/g, "$1"));
                 }
               }
-              ext.ok(genGemini ? "ok" : "empty");
+              ext.ok(`${genGemini ? "ok" : "empty"} · ${extRoute.detail}`);
             } catch { ext.error("extraction failed"); }
           }
 
diff --git a/server/searchRouteLlmRouting.test.ts b/server/searchRouteLlmRouting.test.ts
new file mode 100644
index 000000000..19a21b5a9
--- /dev/null
+++ b/server/searchRouteLlmRouting.test.ts
@@ -0,0 +1,140 @@
+/**
+ * Scenario-based tests for the search route's LLM-router wiring
+ * (server/routes/search.ts `searchRouteSignals` + the shared router).
+ *
+ * Track B of the LLM-router rollout routes the search pipeline's Gemini calls
+ * (classify / extract / synthesize) through shared/llm/router.ts instead of a
+ * hardcoded `gemini-3.1-flash-lite-preview`. The whole point is that this is
+ * BEHAVIOR-PRESERVING for the common case: the floor of every pool is that same
+ * flash-lite model, so a short, single-entity, non-analytical query MUST still
+ * resolve to flash-lite. Only long / analytical / many-source / multi-entity
+ * turns may escalate.
+ *
+ * Per .claude/rules/scenario_testing.md each test starts from a real persona +
+ * goal. The risks we are guarding against:
+ *   - Regression that ESCALATES the simple case (cost blowup + a non-no-op rollout).
+ *   - Regression that FAILS to escalate the genuinely hard comparison case.
+ *   - Non-determinism (same query routing differently across calls — breaks replay).
+ *
+ * Per .claude/rules/agentic_reliability.md DETERMINISTIC: routeLLM + signal
+ * derivation are pure, so identical inputs must always yield the identical model.
+ */
+import { describe, expect, it } from "vitest";
+
+import { searchRouteSignals } from "./routes/search.js";
+import { routeLLM } from "../shared/llm/router.js";
+
+const FLASH_LITE = "gemini-3.1-flash-lite-preview"; // the production floor before this change
+const FLASH = "gemini-3-flash-preview"; // the escalation target for extract/synthesize
+
+describe("search route LLM routing — behavior-preserving floor (the no-op guarantee)", () => {
+  /**
+   * Persona: a founder types a bare company name to pull a quick entity card.
+   * Goal: fast, cheap single-entity lookup.
+   * Prior state: a normal single-entity search with a handful of snippets.
+   * Expected: extract stays on the flash-lite floor — IDENTICAL to pre-router
+   * behavior. If this ever escalates, the "additive no-op" promise is broken.
+   */
+  it("keeps a short single-entity extract on the flash-lite floor", () => {
+    const sig = searchRouteSignals("Mercury", 3); // short, no analytical keyword → complexityHint "low"
+    const r = routeLLM("extract", sig);
+    expect(r.model).toBe(FLASH_LITE);
+    expect(r.tier).toBe("light");
+    expect(r.escalated).toBe(false);
+  });
+
+  /**
+   * Persona: same founder, slightly longer plain-language query, few sources.
+   * Expected: still the floor — length alone (under the medium threshold) and a
+   * non-analytical phrasing must not trip escalation.
+   */
+  it("keeps a medium plain-language single-entity query on the floor", () => {
+    const sig = searchRouteSignals("Tell me about the company Linear and what they do", 4); // well under 240 chars, no analytical keyword → complexityHint "low"
+    const r = routeLLM("extract", sig);
+    expect(r.model).toBe(FLASH_LITE);
+    expect(r.escalated).toBe(false);
+  });
+
+  /**
+   * classify is a single-candidate pool — it can NEVER escalate, no matter how
+   * complex the query looks. This is the strongest no-op guarantee in the route.
+   */
+  it("classify never escalates regardless of query complexity", () => {
+    const heavy = searchRouteSignals(
+      "Compare Anthropic versus OpenAI on tool use, pricing, and the strategic risks for a multi-tenant deployment",
+      12,
+      { multiEntity: true },
+    );
+    const r = routeLLM("classify", heavy); // signals carry hint "high" + multiEntity, yet classify must stay on the floor
+    expect(r.model).toBe(FLASH_LITE);
+    expect(r.tier).toBe("light");
+    expect(r.escalated).toBe(false);
+  });
+});
+
+describe("search route LLM routing — escalation on genuinely hard turns", () => {
+  /**
+   * Persona: an investor runs a head-to-head comparison ("X vs Y") across many
+   * gathered sources. Goal: a well-reasoned comparative synthesis.
+   * Prior state: the multi-entity branch gathered results for 2+ entities.
+   * Expected: extract escalates above the flash-lite floor — multiEntity plus an
+   * analytical verb plus many sources together cross the heavy threshold.
+   */
+  it("escalates a multi-entity comparison with many sources", () => {
+    const sig = searchRouteSignals(
+      "Compare Stripe versus Adyen for cross-border payments — tradeoffs and risks",
+      14,
+      { multiEntity: true },
+    );
+    const r = routeLLM("extract", sig); // hint "high" (analytical keywords) + multiEntity + 14 sources
+    expect(r.escalated).toBe(true);
+    expect(r.model).toBe(FLASH);
+    expect(r.tier).not.toBe("light");
+  });
+
+  /**
+   * Persona: a banker asks a long analytical diligence question over a deep
+   * source set. Expected: synthesize escalates — "diligence" + length + sources.
+   */
+  it("escalates a long analytical diligence synthesis", () => {
+    const q =
+      "Give me a full diligence teardown of why this company's moat is defensible, what the strategic risks are, and how the funding trajectory implies their runway under a downturn scenario at scale.";
+    const r = routeLLM("synthesize", searchRouteSignals(q, 9)); // hint is "high" from the keyword match; multiEntity defaults to false here
+    expect(r.escalated).toBe(true);
+    expect(r.model).toBe(FLASH);
+  });
+
+  /**
+   * Boundary: a short but explicitly multi-entity comparison with only a couple
+   * of sources. multiEntity (0.3) + analytical "compare" hint (0.5) already clears
+   * the escalate threshold (0.5) even without source weight, so it should escalate.
+   */
+  it("escalates a short multi-entity comparison even with few sources", () => {
+    const sig = searchRouteSignals("compare Brave and Serper", 2, { multiEntity: true }); // NOTE(review): the weights quoted above live in shared/llm/router.ts — confirm they still match
+    const r = routeLLM("extract", sig);
+    expect(r.escalated).toBe(true);
+  });
+});
+
+describe("search route LLM routing — determinism (replay safety)", () => {
+  it("derives identical signals + route for identical inputs", () => {
+    const a = searchRouteSignals("How should we evaluate voice agents for latency?", 5);
+    const b = searchRouteSignals("How should we evaluate voice agents for latency?", 5);
+    expect(a).toEqual(b); // same inputs → structurally equal signals
+    expect(routeLLM("extract", a)).toEqual(routeLLM("extract", b)); // and the same routing decision
+  });
+
+  it("coerces missing / negative source counts to a safe 0 (no NaN leaks into scoring)", () => {
+    expect(searchRouteSignals("x", Number.NaN).sourceCount).toBe(0); // Number.NaN is already typed `number`; no assertion needed
+    expect(searchRouteSignals("x", -5).sourceCount).toBe(0);
+    // and the route is still deterministic + on the floor for this trivial input
+    const r = routeLLM("extract", searchRouteSignals("x", Number.NaN));
+    expect(r.model).toBe(FLASH_LITE);
+    expect(r.escalated).toBe(false);
+  });
+
+  it("flags analytical intent as high complexity, plain lookups as low", () => {
+    expect(searchRouteSignals("Why did valuations compress this quarter?", 3).complexityHint).toBe("high"); // "Why" matches the analytical keyword list
+    expect(searchRouteSignals("Acme", 1).complexityHint).toBe("low"); // no keyword and far below the 240-char medium cutoff
+  });
+});