From f5a29c09b736a85a46821448b3f89fd54e2f4171 Mon Sep 17 00:00:00 2001
From: mbeaulne
Date: Thu, 18 Jun 2026 13:05:29 -0400
Subject: [PATCH] add synonym groups

---
Reviewer notes (ignored by git am):
- componentSearchSynonyms.ts: explicit type arguments on the new Map/Set,
  TSDoc for the module, and a readonly parameter on expandSynonymTokens.
- componentSearchIndex.ts: the normalizeSearchText comment now points at
  lexicalSearch, where expansion happens after this patch (tokenize is removed).
- Line breaks and indentation were reconstructed; blob ids on the "index"
  lines predate these revisions.

 src/services/componentSearchIndex.test.ts | 50 +++++++++++++++++++
 src/services/componentSearchIndex.ts      | 43 +++++++++++------
 src/services/componentSearchSynonyms.ts   | 58 +++++++++++++++++++++++
 3 files changed, 136 insertions(+), 15 deletions(-)
 create mode 100644 src/services/componentSearchSynonyms.ts

diff --git a/src/services/componentSearchIndex.test.ts b/src/services/componentSearchIndex.test.ts
index 65c22eaab..ad589f423 100644
--- a/src/services/componentSearchIndex.test.ts
+++ b/src/services/componentSearchIndex.test.ts
@@ -291,6 +291,56 @@ describe("lexicalSearch", () => {
     expect(lexicalSearch(index, "batch")[0]?.digest).toBe("normalize");
   });
 
+  it("expands domain-neutral synonyms", () => {
+    const index = buildSearchIndex([
+      makeSourced({
+        digest: "storage",
+        spec: {
+          name: "upload_object",
+          description: "Upload files to a cloud storage bucket.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "train",
+        spec: {
+          name: "train_model",
+          description: "Train a model on tabular data.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "predict",
+        spec: {
+          name: "predict_labels",
+          description: "Predict labels for examples.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "table",
+        spec: {
+          name: "clean_table",
+          description: "Clean tabular dataframe rows.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+    ]);
+
+    expect(lexicalSearch(index, "gcs")[0]?.digest).toBe("storage");
+    expect(lexicalSearch(index, "fit")[0]?.digest).toBe("train");
+    expect(lexicalSearch(index, "infer")[0]?.digest).toBe("predict");
+    expect(lexicalSearch(index, "df")[0]?.digest).toBe("table");
+  });
+
   it("ignores natural-language filler words that would otherwise swamp intent", () => {
     const index = buildSearchIndex([
       makeSourced({
diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts
index 2ce262b07..c78052db1 100644
--- a/src/services/componentSearchIndex.ts
+++ b/src/services/componentSearchIndex.ts
@@ -15,6 +15,8 @@
 import type { ComponentReference } from "@/utils/componentSpec";
 import { getComponentName } from "@/utils/getComponentName";
 
+import { expandSynonymTokens } from "./componentSearchSynonyms";
+
 /** Which field of a component matched the query. Surfaced in the UI. */
 export type MatchField =
   | "name"
@@ -158,7 +160,7 @@ function stemToken(token: string): string {
   return token;
 }
 
-function normalizeSearchText(text: string): string {
+function baseSearchTokens(text: string): string[] {
   const splitText = splitIdentifierText(text).toLowerCase();
   const tokens = splitText.split(/[^a-z0-9]+/).filter(isNonEmptyString);
   const expandedTokens: string[] = [];
@@ -172,7 +174,17 @@ function normalizeSearchText(text: string): string {
     }
   }
 
-  return [text.toLowerCase(), splitText, expandedTokens.join(" ")].join(" ");
+  return expandedTokens;
+}
+
+function normalizeSearchText(text: string): string {
+  const splitText = splitIdentifierText(text).toLowerCase();
+  // Synonym expansion is query-side only (see `lexicalSearch`). Expanding the
+  // index too would make a query token set intersect a ballooned index token
+  // set, surfacing components that match neither the literal text nor the intent.
+  return [text.toLowerCase(), splitText, baseSearchTokens(text).join(" ")].join(
+    " ",
+  );
 }
 
 function extractAnnotationsText(
@@ -347,16 +359,12 @@ const QUERY_STOP_WORDS = new Set([
 ]);
 
 /**
- * Split a query into meaningful lowercase alphanumeric tokens. Natural-language
- * searches often include filler words ("I want to upload a component to GCS").
- * Dropping those words prevents common tokens like "a"/"to" from matching
- * nearly every component and drowning out the useful intent terms.
+ * Drop filler words and de-duplicate query tokens. Natural-language searches
+ * often include filler ("I want to upload a component to GCS"); removing those
+ * prevents common tokens like "a"/"to" from matching nearly every component and
+ * drowning out the useful intent terms.
  */
-function tokenize(text: string): string[] {
-  const rawTokens = normalizeSearchText(text)
-    .split(/[^a-z0-9]+/)
-    .filter(isNonEmptyString);
-
+function filterQueryTokens(rawTokens: string[]): string[] {
   const tokens: string[] = [];
   const seen = new Set();
   for (const token of rawTokens) {
@@ -416,6 +424,7 @@ interface SearchOptions {
 function scoreEntry(
   entry: IndexEntry,
   tokens: string[],
+  phraseTokens: string[],
 ): { score: number; matchedFields: MatchField[] } {
   const matched = new Set();
   let score = 0;
@@ -433,9 +442,9 @@ function scoreEntry(
   // sides are normalized so the bonus also fires for snake_case names —
   // query "train test split" should match `train_test_split`, not just
   // names that happen to contain literal spaces.
-  if (tokens.length > 1) {
+  if (phraseTokens.length > 1) {
     const normalizedName = entry.searchable.name.replace(/[^a-z0-9]+/g, " ");
-    const normalizedQuery = tokens.join(" ");
+    const normalizedQuery = phraseTokens.join(" ");
     if (normalizedName.includes(normalizedQuery)) {
       score += 10;
       matched.add("name");
@@ -459,12 +468,16 @@ export function lexicalSearch(
   const trimmed = query.trim().toLowerCase();
   if (trimmed.length < minLength) return [];
 
-  const tokens = tokenize(trimmed);
+  const baseTokens = baseSearchTokens(trimmed);
+  const tokens = filterQueryTokens(expandSynonymTokens(baseTokens));
   if (tokens.length === 0) return [];
+  const phraseTokens = baseTokens.filter(
+    (token) => !QUERY_STOP_WORDS.has(token),
+  );
 
   const scored: Array = [];
   for (const entry of index) {
-    const { score, matchedFields } = scoreEntry(entry, tokens);
+    const { score, matchedFields } = scoreEntry(entry, tokens, phraseTokens);
     if (score === 0) continue;
     scored.push({
       reference: entry.reference,
diff --git a/src/services/componentSearchSynonyms.ts b/src/services/componentSearchSynonyms.ts
new file mode 100644
index 000000000..fc373ac24
--- /dev/null
+++ b/src/services/componentSearchSynonyms.ts
@@ -0,0 +1,58 @@
+/**
+ * Groups of interchangeable search terms. Only single-word members are used
+ * for query expansion; multi-word phrases are dropped when the lookup table
+ * below is built.
+ */
+const SYNONYM_GROUPS = [
+  ["gcs", "storage", "bucket", "object storage", "cloud storage"],
+  ["train", "fit", "training", "trainer"],
+  ["predict", "infer", "inference", "score"],
+  ["df", "dataframe", "data frame", "table"],
+  ["csv", "comma separated", "tabular"],
+  ["embed", "embedding", "vectorize", "vector"],
+  ["llm", "language model", "chat model"],
+] as const;
+
+/** Maps each single-word term to all single-word terms of its group. */
+const SYNONYM_TOKENS_BY_TOKEN = new Map<string, readonly string[]>();
+
+for (const group of SYNONYM_GROUPS) {
+  // Only single-word aliases are usable as expansion keys: queries are matched
+  // token-by-token, so a multi-word phrase ("data frame") can never match as a
+  // unit, and fragmenting it would turn common words ("data", "model") into
+  // keys that cross-link unrelated components.
+  const singleWordTerms = group.filter((term) => /^[a-z0-9]+$/.test(term));
+
+  for (const token of singleWordTerms) {
+    SYNONYM_TOKENS_BY_TOKEN.set(token, singleWordTerms);
+  }
+}
+
+/**
+ * Expand query tokens with their single-word synonyms.
+ *
+ * Each token is emitted first, followed by the members of its synonym group
+ * (if any). The result is de-duplicated, keeping the first occurrence. Lookup
+ * is an exact match against lowercase keys, so tokens should already be
+ * lowercased.
+ *
+ * @param tokens - Query tokens to expand; not modified.
+ * @returns The input tokens plus any synonyms, without duplicates.
+ */
+export function expandSynonymTokens(tokens: readonly string[]): string[] {
+  const expanded: string[] = [];
+  const seen = new Set<string>();
+
+  for (const token of tokens) {
+    for (const variant of [
+      token,
+      ...(SYNONYM_TOKENS_BY_TOKEN.get(token) ?? []),
+    ]) {
+      if (seen.has(variant)) continue;
+      seen.add(variant);
+      expanded.push(variant);
+    }
+  }
+
+  return expanded;
+}