diff --git a/src/services/componentSearchIndex.test.ts b/src/services/componentSearchIndex.test.ts index ad589f423..fb541c9b5 100644 --- a/src/services/componentSearchIndex.test.ts +++ b/src/services/componentSearchIndex.test.ts @@ -291,6 +291,111 @@ describe("lexicalSearch", () => { expect(lexicalSearch(index, "batch")[0]?.digest).toBe("normalize"); }); + it("boosts prefix matches for partial search terms", () => { + // Both components match "classif" in the same (name) field, so the only + // thing that can separate them is the prefix bonus. The substring candidate + // also sorts first alphabetically, so without the bonus it would win the + // tie-break — making this test fail if PREFIX_MATCH_BONUS_MULTIPLIER were 0. + const index = buildSearchIndex([ + makeSourced({ + digest: "prefix", + spec: { + name: "classify_rows", // "classif" is a true word-prefix of "classify" + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "substring", + spec: { + // "reclassify" contains "classif" mid-word (never as a prefix), and + // "alpha_…" sorts before "classify_rows". + name: "alpha_reclassify", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + expect(lexicalSearch(index, "classif")[0]?.digest).toBe("prefix"); + }); + + it("boosts rare tokens over common tokens", () => { + // Both candidates contain BOTH query tokens (so the all-tokens bonus applies + // equally) and differ only in which token sits in the high-weight name + // field. The filler components make "model" common and "xgboost" rare, so + // only the IDF weighting can make the rare-token-in-name candidate win — the + // test fails if rare-token weighting is removed (then it ties and the + // alphabetically-earlier "common-in-name" wins instead). 
+ const filler = ["m1", "m2", "m3"].map((digest) => + makeSourced({ + digest, + spec: { + name: `${digest}_model`, + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ); + const index = buildSearchIndex([ + ...filler, + makeSourced({ + digest: "common-in-name", + spec: { + name: "train_model", + description: "Uses xgboost.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "rare-in-name", + spec: { + name: "xgboost_runner", + description: "Builds a model.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + expect(lexicalSearch(index, "model xgboost")[0]?.digest).toBe( + "rare-in-name", + ); + }); + + it("boosts matches that include every query token across fields", () => { + const index = buildSearchIndex([ + makeSourced({ + digest: "partial", + spec: { + name: "a_train_task", + description: "Train something.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "complete", + spec: { + name: "z_train_task", + description: "Produces a model artifact.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + expect(lexicalSearch(index, "train model")[0]?.digest).toBe("complete"); + }); + it("expands domain-neutral synonyms", () => { const index = buildSearchIndex([ makeSourced({ diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts index c78052db1..5c8dc4cad 100644 --- a/src/services/componentSearchIndex.ts +++ b/src/services/componentSearchIndex.ts @@ -364,18 +364,28 @@ const QUERY_STOP_WORDS = new Set([ * prevents common tokens like "a"/"to" from matching nearly every component and * drowning out the useful intent terms. 
*/ -function filterQueryTokens(rawTokens: string[]): string[] { - const tokens: string[] = []; +function uniqueTokens(tokens: string[]): string[] { + const unique: string[] = []; const seen = new Set(); - for (const token of rawTokens) { - if (QUERY_STOP_WORDS.has(token)) continue; - if (!seen.has(token)) { - tokens.push(token); - seen.add(token); - } + for (const token of tokens) { + if (QUERY_STOP_WORDS.has(token) || seen.has(token)) continue; + seen.add(token); + unique.push(token); } + return unique; +} - return tokens; +function tokenize(text: string): string[] { + return uniqueTokens(expandSynonymTokens(baseSearchTokens(text))); +} + +function requiredQueryTokens(text: string): string[] { + const rawTokens = splitIdentifierText(text) + .toLowerCase() + .split(/[^a-z0-9]+/) + .filter(isNonEmptyString) + .map(stemToken); + return uniqueTokens(rawTokens); } /** @@ -391,6 +401,17 @@ const FIELD_WEIGHTS: Record = { metadata: 1, }; +const FIELD_PHRASE_BONUS: Record = { + name: 10, + description: 4, + io: 4, + implementation: 2, + metadata: 2, +}; + +const PREFIX_MATCH_BONUS_MULTIPLIER = 0.5; +const ALL_QUERY_TOKENS_BONUS = 6; + const SEARCH_FIELDS: MatchField[] = [ "name", "description", @@ -409,14 +430,40 @@ interface SearchOptions { minLength?: number; } +function searchableTokens(text: string): string[] { + return text.split(/[^a-z0-9]+/).filter(isNonEmptyString); +} + +function entryMatchesToken(entry: IndexEntry, token: string): boolean { + return SEARCH_FIELDS.some((field) => entry.searchable[field].includes(token)); +} + +function buildRareTokenWeights( + index: IndexEntry[], + tokens: string[], +): Map { + const weights = new Map(); + for (const token of tokens) { + const documentFrequency = index.filter((entry) => + entryMatchesToken(entry, token), + ).length; + const inverseFrequency = Math.log( + (index.length + 1) / (documentFrequency + 1), + ); + weights.set(token, 1 + Math.max(0, inverseFrequency)); + } + return weights; +} + /** * Score one entry 
against the tokenized query. Returns 0 if no field matched. * * Scoring model: * - Per query token: each field that contains the token contributes its weight. - * - Bonus: full multi-token query as a substring of the name (+10). Catches - * "train test split" matching `train_test_split` strongly even though we - * tokenized. + * - Word-boundary matches (a query token that begins an indexed token, + * including exact matches) get a small extra boost, useful for partial names. + * - Rare query tokens count more than tokens that match many components. + * - Contiguous multi-token phrase matches and all-token matches get bonuses. * * Indexed text and query text are normalized before scoring; raw scores are * only used for ordering. @@ -424,33 +471,63 @@ interface SearchOptions { function scoreEntry( entry: IndexEntry, tokens: string[], - phraseTokens: string[], + requiredTokens: string[], + tokenWeights: Map, ): { score: number; matchedFields: MatchField[] } { const matched = new Set(); let score = 0; + // A field's tokenization depends only on the field, not the query token, so + // split it once per entry (cached) rather than re-splitting inside the + // per-query-token loop — that re-split is hot on every keystroke. + const fieldTokenCache = new Map(); + const fieldTokensFor = (field: MatchField): string[] => { + const cached = fieldTokenCache.get(field); + if (cached) return cached; + const fieldTokens = searchableTokens(entry.searchable[field]); + fieldTokenCache.set(field, fieldTokens); + return fieldTokens; + }; + for (const token of tokens) { + const tokenWeight = tokenWeights.get(token) ?? 
1; for (const field of SEARCH_FIELDS) { - if (entry.searchable[field].includes(token)) { - score += FIELD_WEIGHTS[field]; - matched.add(field); + const fieldText = entry.searchable[field]; + if (!fieldText.includes(token)) continue; + + const fieldWeight = FIELD_WEIGHTS[field]; + score += fieldWeight * tokenWeight; + matched.add(field); + + const hasPrefixMatch = fieldTokensFor(field).some((fieldToken) => + fieldToken.startsWith(token), + ); + if (hasPrefixMatch) { + score += fieldWeight * PREFIX_MATCH_BONUS_MULTIPLIER * tokenWeight; } } } - // Multi-token contiguous match in the name is a very strong signal. Both - // sides are normalized so the bonus also fires for snake_case names — - // query "train test split" should match `train_test_split`, not just - // names that happen to contain literal spaces. - if (phraseTokens.length > 1) { - const normalizedName = entry.searchable.name.replace(/[^a-z0-9]+/g, " "); - const normalizedQuery = phraseTokens.join(" "); - if (normalizedName.includes(normalizedQuery)) { - score += 10; - matched.add("name"); + if (requiredTokens.length > 1) { + const normalizedQuery = requiredTokens.join(" "); + for (const field of SEARCH_FIELDS) { + const normalizedField = entry.searchable[field].replace( + /[^a-z0-9]+/g, + " ", + ); + if (!normalizedField.includes(normalizedQuery)) continue; + score += FIELD_PHRASE_BONUS[field]; + matched.add(field); } } + if ( + requiredTokens.length > 1 && + requiredTokens.every((token) => entryMatchesToken(entry, token)) + ) { + score += ALL_QUERY_TOKENS_BONUS; + } + return { score, matchedFields: [...matched] }; } @@ -468,16 +545,19 @@ export function lexicalSearch( const trimmed = query.trim().toLowerCase(); if (trimmed.length < minLength) return []; - const baseTokens = baseSearchTokens(trimmed); - const tokens = filterQueryTokens(expandSynonymTokens(baseTokens)); + const tokens = tokenize(trimmed); if (tokens.length === 0) return []; - const phraseTokens = baseTokens.filter( - (token) => 
!QUERY_STOP_WORDS.has(token), - ); + const requiredTokens = requiredQueryTokens(trimmed); + const tokenWeights = buildRareTokenWeights(index, tokens); const scored: Array = []; for (const entry of index) { - const { score, matchedFields } = scoreEntry(entry, tokens, phraseTokens); + const { score, matchedFields } = scoreEntry( + entry, + tokens, + requiredTokens, + tokenWeights, + ); if (score === 0) continue; scored.push({ reference: entry.reference,