Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -251,4 +251,24 @@ describe("buildLexicalMatches / buildAiCandidateMatches", () => {
const candidates = buildAiCandidateMatches(index, "qqzznomatch");
expect(candidates.map((m) => m.digest)).toEqual(["alpha", "zebra"]);
});

it("adds source-diverse candidates beyond the top lexical hits", () => {
  // One hundred entries that all share the "standard" source and carry
  // "train" in both reference strings.
  const standardEntries = Array.from({ length: 100 }, (_, i) => ({
    reference: ref(`train-${i}`, `train_${i}`),
    source: source("standard"),
  }));
  // A single entry from the user source whose reference strings do not
  // contain the query text.
  const userEntry = {
    reference: ref("user-upload", "upload_file"),
    source: USER_SOURCE,
  };
  const broadIndex = buildSearchIndex([...standardEntries, userEntry]);

  const digests = buildAiCandidateMatches(broadIndex, "train").map(
    (candidate) => candidate.digest,
  );

  // The pool is expected to hold exactly 80 candidates and still include the
  // lone user-source entry.
  expect(digests).toHaveLength(80);
  expect(digests).toContain("user-upload");
});
});
88 changes: 73 additions & 15 deletions src/routes/v2/pages/Editor/components/componentSearchV2Logic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ import type {

/** How many lexical hits to display before the user asks for AI judgment. */
const LEXICAL_RESULT_LIMIT = 50;
// Candidate pool sent to AI rerank on click. Matches LEXICAL_RESULT_LIMIT so
// every displayed result is scored and can show a relevance percentage.
const AI_CANDIDATE_LIMIT = 50;
const AI_CANDIDATE_LIMIT = 80;
const AI_LEXICAL_CANDIDATE_LIMIT = 60;
const AI_SOURCE_DIVERSITY_CANDIDATES_PER_SOURCE = 8;
// Scores at or below this are treated as the model excluding a candidate: such
// items keep their place in the list but are not badged as relevance matches.
const RERANK_EXCLUSION_THRESHOLD = 0.01;
Expand Down Expand Up @@ -281,27 +281,85 @@ export function buildLexicalMatches(
});
}

/**
 * Picks up to `limit` items spread evenly across `items`, keeping their
 * original order.
 *
 * When `items` already holds `limit` items or fewer, the input array itself is
 * returned (not a copy). Otherwise slot `i` of the result is taken from
 * position `floor(i * items.length / limit)`.
 *
 * @param items - Ordered list to sample from.
 * @param limit - Maximum number of items to return.
 * @returns The input array, or a new array with the evenly spaced picks.
 */
function sampleEvenly<T>(items: T[], limit: number): T[] {
  const total = items.length;
  if (total <= limit) return items;

  const stride = total / limit;
  // Truncate so a fractional limit still yields a whole number of slots.
  const slots = Math.trunc(limit);
  const picked: T[] = [];
  for (let slot = 0; slot < slots; slot += 1) {
    picked.push(items[Math.floor(slot * stride)]);
  }
  return picked;
}

/**
 * Appends `matches` to `target` in order, skipping any match whose digest is
 * already in `seenDigests`, and never letting `target` grow past `limit`.
 *
 * Both `target` and `seenDigests` are mutated in place so that successive
 * calls can layer several candidate sources into one de-duplicated pool.
 *
 * @param target - Candidate pool being built; receives the new matches.
 * @param seenDigests - Digests already present in `target`; updated as
 *   matches are appended.
 * @param matches - Matches to append, in priority order.
 * @param limit - Maximum size of `target`; defaults to `AI_CANDIDATE_LIMIT`.
 */
function appendUniqueMatches(
  target: LexicalMatch[],
  seenDigests: Set<string>,
  matches: LexicalMatch[],
  limit: number = AI_CANDIDATE_LIMIT,
) {
  for (const match of matches) {
    // Check the cap before pushing: a pool that is already full when this
    // call starts must not receive one extra item.
    if (target.length >= limit) return;
    if (seenDigests.has(match.digest)) continue;
    seenDigests.add(match.digest);
    target.push(match);
  }
}

/**
 * Builds a browse sample that draws a few entries from every source in the
 * index, regardless of any query.
 *
 * Entries are grouped by `source.kind` and `source.id`. Each group is sorted
 * by `name` and reduced with `sampleEvenly` to at most
 * `AI_SOURCE_DIVERSITY_CANDIDATES_PER_SOURCE` entries. Groups appear in the
 * order their source is first seen in `index`.
 *
 * @param index - All searchable entries.
 * @returns The sampled entries converted with `indexEntryToLexicalMatch`.
 */
function buildSourceDiverseBrowseMatches(index: IndexEntry[]): LexicalMatch[] {
  const bySource = new Map<string, IndexEntry[]>();
  for (const entry of index) {
    const key = `${entry.source.kind}:${entry.source.id}`;
    const bucket = bySource.get(key);
    // Push into the existing bucket instead of re-copying it for every entry,
    // which keeps grouping linear in the size of the index.
    if (bucket) {
      bucket.push(entry);
    } else {
      bySource.set(key, [entry]);
    }
  }

  const matches: LexicalMatch[] = [];
  for (const entries of bySource.values()) {
    // The buckets are owned by this function, so sorting in place is safe.
    entries.sort((a, b) => a.name.localeCompare(b.name));
    matches.push(
      ...sampleEvenly(entries, AI_SOURCE_DIVERSITY_CANDIDATES_PER_SOURCE).map(
        indexEntryToLexicalMatch,
      ),
    );
  }

  return matches;
}

/**
* Bounded candidate pool for AI rerank. Prefers broad lexical hits; when
* literal matching finds nothing it falls back to an alphabetical browse slice
* so natural-language queries stay useful.
* Bounded candidate pool for AI rerank. Starts with the strongest lexical hits,
* then adds a source-diverse browse sample so AI can rescue plausible matches
* that literal scoring missed.
*/
export function buildAiCandidateMatches(
index: IndexEntry[],
trimmedQuery: string,
): LexicalMatch[] {
if (trimmedQuery.length === 0) return [];

const broadMatches = lexicalSearch(index, trimmedQuery, {
limit: AI_CANDIDATE_LIMIT,
minLength: 1,
});
if (broadMatches.length > 0) return broadMatches;
const candidates: LexicalMatch[] = [];
const seenDigests = new Set<string>();

appendUniqueMatches(
candidates,
seenDigests,
lexicalSearch(index, trimmedQuery, {
limit: AI_LEXICAL_CANDIDATE_LIMIT,
minLength: 1,
}),
);

appendUniqueMatches(
candidates,
seenDigests,
buildSourceDiverseBrowseMatches(index),
);

const sortedIndex = [...index].sort((a, b) => a.name.localeCompare(b.name));
appendUniqueMatches(
candidates,
seenDigests,
sampleEvenly(sortedIndex, AI_CANDIDATE_LIMIT).map(indexEntryToLexicalMatch),

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 This is an AI-generated code review comment.

[MEDIUM] The source-diversity layer (buildSourceDiverseBrowseMatches) and this alphabetical-fill layer run unconditionally, padding the AI candidate pool toward the cap (AI_CANDIDATE_LIMIT = 80) with alphabetically-early, lexically-irrelevant components even when lexical search already returned a strong source-spanning set.

This is not a correctness bug — lexical hits are preserved first (appended before the fill layers), and RERANK_EXCLUSION_THRESHOLD keeps junk from being badged. But it sends more low-signal candidates to the billed reranker on every rerank (cost/latency), and can surface irrelevant items in the unbadged tail.

Optional fix: only run the fill layer when the pool is under a smaller floor, or skip the alphabetical fill when lexical + diversity already produced a source-diverse set. Worth confirming reranker cost at 80 vs 50 candidates is acceptable.

);

return [...index]
.sort((a, b) => a.name.localeCompare(b.name))
.slice(0, AI_CANDIDATE_LIMIT)
.map(indexEntryToLexicalMatch);
return candidates;
}

export function buildRerankScoreByDigest(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,7 @@ export function useComponentSearchV2State(

if (candidates.length === 0) return;

setRerankBaseMatches(
lexicalMatches.length > 0 ? lexicalMatches : aiCandidateMatches,
);
setRerankBaseMatches(aiCandidateMatches);
setRerankedFor(trimmedQuery);
// Score every candidate so each displayed result shows a relevance %.
mutate({ query: trimmedQuery, candidates, scoreAllCandidates: true });
Expand Down
Loading