it("applies typo tolerance to component names and input/output fields", () => {
  // One component whose NAME is a single edit away from the query, and one
  // whose INPUT name is a single edit away from the query.
  const index = buildSearchIndex([
    makeSourced({
      digest: "name-typo",
      spec: {
        name: "filter_rows",
        inputs: [],
        outputs: [],
        implementation: { container: { image: "x" } },
      },
    }),
    makeSourced({
      digest: "io-typo",
      spec: {
        name: "prepare_data",
        inputs: [{ name: "dataset" }],
        outputs: [{ name: "clean_table" }],
        implementation: { container: { image: "x" } },
      },
    }),
  ]);

  // "filtr" is "filter" with one character missing.
  expect(lexicalSearch(index, "filtr")[0]?.digest).toBe("name-typo");
  // "datset" is "dataset" with one character missing.
  expect(lexicalSearch(index, "datset")[0]?.digest).toBe("io-typo");
});

it("does not apply typo tolerance to descriptions or implementation text", () => {
  // "xgboost" appears only in the description of the first component and only
  // in the container image of the second; neither name nor IO mentions it.
  const index = buildSearchIndex([
    makeSourced({
      digest: "description-only",
      spec: {
        name: "generic_component",
        description: "Runs an xgboost classifier.",
        inputs: [],
        outputs: [],
        implementation: { container: { image: "x" } },
      },
    }),
    makeSourced({
      digest: "implementation-only",
      spec: {
        name: "generic_runner",
        inputs: [],
        outputs: [],
        implementation: { container: { image: "python:3.11-xgboost" } },
      },
    }),
  ]);

  // Positive control: the correctly spelled token must be findable, otherwise
  // the negative assertion below would pass even if this text were not
  // indexed at all.
  expect(lexicalSearch(index, "xgboost").length).toBeGreaterThan(0);

  // "xgbost" is one edit away from "xgboost", so it would match if fuzzy
  // matching were applied to description or implementation text.
  expect(lexicalSearch(index, "xgbost")).toHaveLength(0);
});
/**
 * Returns how many single-character edits a query token may differ by and
 * still count as a fuzzy match.
 *
 * @param token - The query token being matched.
 * @returns 0 for tokens shorter than 5 characters, 1 for tokens of 5 or 6
 *   characters, and 2 for tokens of 7 or more characters.
 */
function maxTypoDistance(token: string): number {
  // Require length >= 5 before allowing any edits: 4-char tokens are too short
  // for distance-1 fuzziness without false positives on generic IO names
  // (data<->date, path<->bath, list<->last).
  if (token.length < 5) return 0;
  if (token.length < 7) return 1;
  return 2;
}

/**
 * Checks whether the Levenshtein distance (insertions, deletions and
 * substitutions; a swap of two adjacent characters counts as two edits)
 * between two strings is at most `maxDistance`.
 *
 * @param left - First string.
 * @param right - Second string.
 * @param maxDistance - Largest distance that still counts as a match.
 * @returns `true` when the strings are within `maxDistance` edits.
 */
function isEditDistanceAtMost(
  left: string,
  right: string,
  maxDistance: number,
): boolean {
  if (maxDistance === 0) return left === right;
  // Every unit of length difference costs at least one insertion/deletion.
  if (Math.abs(left.length - right.length) > maxDistance) return false;

  // Two rolling rows of the dynamic-programming table, swapped after each
  // iteration so no per-row allocation is needed.
  const rowLength = right.length + 1;
  let previous = Array.from({ length: rowLength }, (_, index) => index);
  let current = new Array<number>(rowLength).fill(0);

  for (let leftIndex = 1; leftIndex <= left.length; leftIndex++) {
    const leftChar = left[leftIndex - 1];
    current[0] = leftIndex;
    let rowMinimum = leftIndex;

    for (let rightIndex = 1; rightIndex < rowLength; rightIndex++) {
      const substitutionCost = leftChar === right[rightIndex - 1] ? 0 : 1;
      const value = Math.min(
        previous[rightIndex] + 1, // deletion from `left`
        current[rightIndex - 1] + 1, // insertion into `left`
        previous[rightIndex - 1] + substitutionCost, // match or substitution
      );
      current[rightIndex] = value;
      rowMinimum = Math.min(rowMinimum, value);
    }

    // Values never decrease from one row to the next, so once a whole row
    // exceeds the budget the final distance must exceed it too.
    if (rowMinimum > maxDistance) return false;
    [previous, current] = [current, previous];
  }

  return previous[right.length] <= maxDistance;
}

/**
 * Reports whether any field token is a typo-level match for a query token.
 *
 * The allowed distance comes from `maxTypoDistance(token)`; tokens that allow
 * no edits never match here. Field tokens that contain the query token, or
 * are contained in it, are skipped, so only genuine misspellings (not
 * substring relationships) count as fuzzy matches.
 *
 * @param fieldTokens - Tokens of the field being searched.
 * @param token - The query token.
 * @returns `true` when at least one field token is within the allowed edit
 *   distance and is not in a substring relationship with `token`.
 */
function hasFuzzyTokenMatch(fieldTokens: string[], token: string): boolean {
  const maxDistance = maxTypoDistance(token);
  if (maxDistance === 0) return false;
  return fieldTokens.some(
    (fieldToken) =>
      !fieldToken.includes(token) &&
      !token.includes(fieldToken) &&
      isEditDistanceAtMost(token, fieldToken, maxDistance),
  );
}