TangleML · Mbeaulne · Jun 18, 2026 · Mbeaulne · Jun 18, 2026
@@ -251,4 +251,24 @@ describe("buildLexicalMatches / buildAiCandidateMatches", () => {
     const candidates = buildAiCandidateMatches(index, "qqzznomatch");
     expect(candidates.map((m) => m.digest)).toEqual(["alpha", "zebra"]);
   });
+
+  it("adds source-diverse candidates beyond the top lexical hits", () => {
+    // 100 entries from a single "standard" source (digests train-0..train-99)
+    // plus one lone entry from USER_SOURCE with digest "user-upload".
+    // Presumably ref(digest, name); confirm against the helper's definition.
+    const broadIndex = buildSearchIndex([
+      ...Array.from({ length: 100 }, (_, i) => ({
+        reference: ref(`train-${i}`, `train_${i}`),
+        source: source("standard"),
+      })),
+      {
+        reference: ref("user-upload", "upload_file"),
+        source: USER_SOURCE,
+      },
+    ]);
+
+    const candidates = buildAiCandidateMatches(broadIndex, "train");
+
+    // The pool is expected to hold exactly 80 entries and still include the
+    // single USER_SOURCE entry alongside the 100 "train" entries.
+    expect(candidates).toHaveLength(80);
+    expect(candidates.map((candidate) => candidate.digest)).toContain(
+      "user-upload",
+    );
+  });
 });
@@ -29,9 +29,9 @@ import type {
 
 /** How many lexical hits to display before the user asks for AI judgment. */
 const LEXICAL_RESULT_LIMIT = 50;
-// Candidate pool sent to AI rerank on click. Matches LEXICAL_RESULT_LIMIT so
-// every displayed result is scored and can show a relevance percentage.
-const AI_CANDIDATE_LIMIT = 50;
+// Pool size at which candidate assembly for AI rerank stops appending; also
+// the sample size of the final whole-index browse pass.
+const AI_CANDIDATE_LIMIT = 80;
+// Maximum number of lexical hits requested for the front of that pool.
+const AI_LEXICAL_CANDIDATE_LIMIT = 60;
+// Maximum entries sampled from each source in the source-diverse browse pass.
+const AI_SOURCE_DIVERSITY_CANDIDATES_PER_SOURCE = 8;
 // Scores at or below this are treated as the model excluding a candidate: such
 // items keep their place in the list but are not badged as relevance matches.
 const RERANK_EXCLUSION_THRESHOLD = 0.01;
@@ -281,27 +281,85 @@ export function buildLexicalMatches(
   });
 }
 
+/**
+ * Returns at most `limit` items taken at evenly spaced positions across
+ * `items`, preserving their order. When `items` already fits within `limit`
+ * the input array itself is returned (not a copy).
+ */
+function sampleEvenly<T>(items: T[], limit: number): T[] {
+  const total = items.length;
+  if (total <= limit) return items;
+
+  // Fractional stride; flooring each multiple keeps every index below `total`.
+  const stride = total / limit;
+  // Same slot count that Array.from({ length: limit }) would produce.
+  const slots = Math.max(0, Math.trunc(limit));
+  const picked: T[] = [];
+  for (let slot = 0; slot < slots; slot += 1) {
+    picked.push(items[Math.floor(slot * stride)]);
+  }
+  return picked;
+}
+
+/**
+ * Appends every match whose digest has not been seen yet to `target`, and
+ * records that digest in `seenDigests`. Both arguments are mutated in place.
+ *
+ * Appending stops once `target` holds AI_CANDIDATE_LIMIT entries. The cap is
+ * checked before each push, so calling this with an already-full `target`
+ * adds nothing; successive calls can therefore never grow the pool past the
+ * limit.
+ */
+function appendUniqueMatches(
+  target: LexicalMatch[],
+  seenDigests: Set<string>,
+  matches: LexicalMatch[],
+): void {
+  for (const match of matches) {
+    if (target.length >= AI_CANDIDATE_LIMIT) return;
+    if (seenDigests.has(match.digest)) continue;
+    seenDigests.add(match.digest);
+    target.push(match);
+  }
+}
+
+/**
+ * Builds a browse sample that draws from every source present in `index`.
+ *
+ * Entries are grouped by `source.kind` and `source.id`; each group is sorted
+ * by name and reduced to at most AI_SOURCE_DIVERSITY_CANDIDATES_PER_SOURCE
+ * evenly spaced entries. Groups are emitted in the order their source first
+ * appears in `index`. The input array is not modified.
+ */
+function buildSourceDiverseBrowseMatches(index: IndexEntry[]): LexicalMatch[] {
+  // Buckets are appended to in place so grouping stays linear in index size.
+  const bySource = new Map<string, IndexEntry[]>();
+  for (const entry of index) {
+    const key = `${entry.source.kind}:${entry.source.id}`;
+    const bucket = bySource.get(key);
+    if (bucket) {
+      bucket.push(entry);
+    } else {
+      bySource.set(key, [entry]);
+    }
+  }
+
+  const matches: LexicalMatch[] = [];
+  for (const entries of bySource.values()) {
+    // The bucket is private to this function, so sorting it in place is safe.
+    entries.sort((a, b) => a.name.localeCompare(b.name));
+    matches.push(
+      ...sampleEvenly(entries, AI_SOURCE_DIVERSITY_CANDIDATES_PER_SOURCE).map(
+        indexEntryToLexicalMatch,
+      ),
+    );
+  }
+
+  return matches;
+}
+
 /**
- * Bounded candidate pool for AI rerank. Prefers broad lexical hits; when
- * literal matching finds nothing it falls back to an alphabetical browse slice
- * so natural-language queries stay useful.
+ * Bounded candidate pool for AI rerank. Starts with the strongest lexical hits,
+ * then adds a source-diverse browse sample so AI can rescue plausible matches
+ * that literal scoring missed.
+ *
+ * The pool is assembled in three passes, each de-duplicated by digest:
+ *  1. lexical hits for the query, limited to AI_LEXICAL_CANDIDATE_LIMIT;
+ *  2. the per-source browse sample from buildSourceDiverseBrowseMatches;
+ *  3. an even sample of up to AI_CANDIDATE_LIMIT entries from the index
+ *     sorted by name, used to fill whatever room is left.
+ *
+ * Returns an empty array when `trimmedQuery` is empty.
+ *
+ * NOTE(review): the AI_CANDIDATE_LIMIT cap is enforced inside
+ * appendUniqueMatches; confirm a later pass cannot add an entry to a pool
+ * that an earlier pass already filled.
  */
 export function buildAiCandidateMatches(
   index: IndexEntry[],
   trimmedQuery: string,
 ): LexicalMatch[] {
   if (trimmedQuery.length === 0) return [];
 
-  const broadMatches = lexicalSearch(index, trimmedQuery, {
-    limit: AI_CANDIDATE_LIMIT,
-    minLength: 1,
-  });
-  if (broadMatches.length > 0) return broadMatches;
+  const candidates: LexicalMatch[] = [];
+  const seenDigests = new Set<string>();
+
+  appendUniqueMatches(
+    candidates,
+    seenDigests,
+    lexicalSearch(index, trimmedQuery, {
+      limit: AI_LEXICAL_CANDIDATE_LIMIT,
+      minLength: 1,
+    }),
+  );
+
+  appendUniqueMatches(
+    candidates,
+    seenDigests,
+    buildSourceDiverseBrowseMatches(index),
+  );
+
+  const sortedIndex = [...index].sort((a, b) => a.name.localeCompare(b.name));
+  appendUniqueMatches(
+    candidates,
+    seenDigests,
+    sampleEvenly(sortedIndex, AI_CANDIDATE_LIMIT).map(indexEntryToLexicalMatch),
+  );
 
-  return [...index]
-    .sort((a, b) => a.name.localeCompare(b.name))
-    .slice(0, AI_CANDIDATE_LIMIT)
-    .map(indexEntryToLexicalMatch);
+  return candidates;
 }
 
 export function buildRerankScoreByDigest(

@@ -215,9 +215,7 @@ export function useComponentSearchV2State(
 
     if (candidates.length === 0) return;
 
-    setRerankBaseMatches(
-      lexicalMatches.length > 0 ? lexicalMatches : aiCandidateMatches,
-    );
+    setRerankBaseMatches(aiCandidateMatches);
     setRerankedFor(trimmedQuery);
     // Score every candidate so each displayed result shows a relevance %.
     mutate({ query: trimmedQuery, candidates, scoreAllCandidates: true });