From e7b76a8cddd99d6db8e5183ef60b9c9f8943c235 Mon Sep 17 00:00:00 2001 From: mbeaulne Date: Thu, 18 Jun 2026 12:55:10 -0400 Subject: [PATCH 1/2] Normalize component search tokens for better matching --- src/services/componentSearchIndex.test.ts | 57 +++++++++++++-- src/services/componentSearchIndex.ts | 85 +++++++++++++++++++---- 2 files changed, 125 insertions(+), 17 deletions(-) diff --git a/src/services/componentSearchIndex.test.ts b/src/services/componentSearchIndex.test.ts index 72a6974f8..65c22eaab 100644 --- a/src/services/componentSearchIndex.test.ts +++ b/src/services/componentSearchIndex.test.ts @@ -236,10 +236,59 @@ describe("lexicalSearch", () => { expect(results[0]?.digest).toBe("contiguous"); }); - it("tokenizes snake_case queries so each segment matches independently", () => { - const index = buildSearchIndex(fixtures); - const results = lexicalSearch(index, "drop nulls"); - expect(results[0]?.digest).toBe("b"); + it("normalizes snake_case, kebab-case, and camelCase text", () => { + const index = buildSearchIndex([ + makeSourced({ + digest: "snake", + spec: { + name: "drop_nulls", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "kebab", + spec: { + name: "train-model", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "camel", + spec: { + name: "loadCSVFile", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + expect(lexicalSearch(index, "drop nulls")[0]?.digest).toBe("snake"); + expect(lexicalSearch(index, "train model")[0]?.digest).toBe("kebab"); + expect(lexicalSearch(index, "load csv file")[0]?.digest).toBe("camel"); + }); + + it("normalizes plurals and simple stemmed terms", () => { + const index = buildSearchIndex([ + makeSourced({ + digest: "normalize", + spec: { + name: "train_model", + description: "Train models on a dataset with labeled batches.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + expect(lexicalSearch(index, "training")[0]?.digest).toBe("normalize"); + expect(lexicalSearch(index, "datasets")[0]?.digest).toBe("normalize"); + expect(lexicalSearch(index, "batch")[0]?.digest).toBe("normalize"); }); it("ignores natural-language filler words that would otherwise swamp intent", () => { diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts index ac7be68ed..2f21868cf 100644 --- a/src/services/componentSearchIndex.ts +++ b/src/services/componentSearchIndex.ts @@ -54,7 +54,7 @@ export interface IndexEntry { name: string; /** Where this component came from. */ source: ComponentSearchSource; - /** Pre-lowercased searchable text, one per logical field. */ + /** Normalized searchable text, one per logical field. */ searchable: Record; } @@ -113,6 +113,62 @@ function stringifySearchValue(value: unknown): string { } } +function splitIdentifierText(text: string): string { + return text + .replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2") + .replace(/([a-z0-9])([A-Z])/g, "$1 $2") + .replace(/[_-]+/g, " "); +} + +function removeSuffixAndCollapseDoubleFinal( + token: string, + suffixLength: number, +): string { + const stemmed = token.slice(0, -suffixLength); + if (stemmed.length < 3) return stemmed; + + const last = stemmed.at(-1); + const previous = stemmed.at(-2); + return last && last === previous ? stemmed.slice(0, -1) : stemmed; +} + +function stemToken(token: string): string { + if (token.length <= 3) return token; + if (token.endsWith("ies") && token.length > 4) { + return `${token.slice(0, -3)}y`; + } + if (token.endsWith("ing") && token.length > 5) { + return removeSuffixAndCollapseDoubleFinal(token, 3); + } + if (token.endsWith("ed") && token.length > 4) { + return removeSuffixAndCollapseDoubleFinal(token, 2); + } + if (/(ches|shes|xes|zes|ses)$/.test(token) && token.length > 4) { + return token.slice(0, -2); + } + if (token.endsWith("s") && !token.endsWith("ss") && token.length > 3) { + return token.slice(0, -1); + } + return token; +} + +function normalizeSearchText(text: string): string { + const splitText = splitIdentifierText(text).toLowerCase(); + const tokens = splitText.split(/[^a-z0-9]+/).filter(isNonEmptyString); + const expandedTokens: string[] = []; + const seen = new Set(); + + for (const token of tokens) { + for (const variant of [token, stemToken(token)]) { + if (seen.has(variant)) continue; + seen.add(variant); + expandedTokens.push(variant); + } + } + + return [text.toLowerCase(), splitText, expandedTokens.join(" ")].join(" "); +} + function extractAnnotationsText( annotations: Record | undefined, ): string { @@ -243,14 +299,17 @@ export function buildSearchIndex(sourced: SourcedReference[]): IndexEntry[] { name: metadata.name, source, searchable: { - name: metadata.name.toLowerCase(), - description: metadata.description.toLowerCase(), - io: metadata.ioText.toLowerCase(), - implementation: extractImplementationText(reference), - metadata: [metadata.metadataText, source.label, reference.published_by] - .filter(isNonEmptyString) - .join(" ") - .toLowerCase(), + name: normalizeSearchText(metadata.name), + description: normalizeSearchText(metadata.description), + io: normalizeSearchText(metadata.ioText), + implementation: normalizeSearchText( + extractImplementationText(reference), + ), + metadata: normalizeSearchText( + [metadata.metadataText, source.label, reference.published_by] + .filter(isNonEmptyString) + .join(" "), + ), }, }); } @@ -288,10 +347,9 @@ const QUERY_STOP_WORDS = new Set([ * nearly every component and drowning out the useful intent terms. */ function tokenize(text: string): string[] { - const rawTokens = text - .toLowerCase() + const rawTokens = normalizeSearchText(text) .split(/[^a-z0-9]+/) - .filter((t) => t.length > 0); + .filter(isNonEmptyString); const tokens: string[] = []; const seen = new Set(); @@ -346,7 +404,8 @@ interface SearchOptions { * "train test split" matching `train_test_split` strongly even though we * tokenized. * - * We deliberately do not normalize — raw scores are only used for ordering. + * Indexed text and query text are normalized before scoring; raw scores are + * only used for ordering. */ function scoreEntry( entry: IndexEntry, From 0494d71681fed0e0d7889d0f44fce6bca1781f94 Mon Sep 17 00:00:00 2001 From: mbeaulne Date: Thu, 18 Jun 2026 15:08:35 -0400 Subject: [PATCH 2/2] address pr feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - splitIdentifierText: anchor first capital group to a single char to remove O(n²) regex backtracking on long uppercase runs (behavior-preserving) - stemToken: guard -is/-us endings so status/analysis/axis aren't over-stemmed Co-Authored-By: Claude Opus 4.8 (1M context) --- src/services/componentSearchIndex.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts index 2f21868cf..2ce262b07 100644 --- a/src/services/componentSearchIndex.ts +++ b/src/services/componentSearchIndex.ts @@ -115,7 +115,7 @@ function stringifySearchValue(value: unknown): string { function splitIdentifierText(text: string): string { return text - .replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2") + .replace(/([A-Z])([A-Z][a-z])/g, "$1 $2") .replace(/([a-z0-9])([A-Z])/g, "$1 $2") .replace(/[_-]+/g, " "); } @@ -146,7 +146,13 @@ function stemToken(token: string): string { if (/(ches|shes|xes|zes|ses)$/.test(token) && token.length > 4) { return token.slice(0, -2); } - if (token.endsWith("s") && !token.endsWith("ss") && token.length > 3) { + if ( + token.endsWith("s") && + !token.endsWith("ss") && + !token.endsWith("is") && + !token.endsWith("us") && + token.length > 3 + ) { return token.slice(0, -1); } return token;