From f5a29c09b736a85a46821448b3f89fd54e2f4171 Mon Sep 17 00:00:00 2001
From: mbeaulne <matt.beaulne@gmail.com>
Date: Thu, 18 Jun 2026 13:05:29 -0400
Subject: [PATCH] add synonym groups

---
 src/services/componentSearchIndex.test.ts | 50 +++++++++++++++++++++++
 src/services/componentSearchIndex.ts      | 43 ++++++++++++-------
 src/services/componentSearchSynonyms.ts   | 41 +++++++++++++++++++
 3 files changed, 119 insertions(+), 15 deletions(-)
 create mode 100644 src/services/componentSearchSynonyms.ts

diff --git a/src/services/componentSearchIndex.test.ts b/src/services/componentSearchIndex.test.ts
index 65c22eaab..ad589f423 100644
--- a/src/services/componentSearchIndex.test.ts
+++ b/src/services/componentSearchIndex.test.ts
@@ -291,6 +291,56 @@ describe("lexicalSearch", () => {
     expect(lexicalSearch(index, "batch")[0]?.digest).toBe("normalize");
   });
 
+  it("expands domain-neutral synonyms", () => {
+    // Four minimal components that differ only in digest, name and
+    // description.
+    const fixtures = [
+      {
+        digest: "storage",
+        name: "upload_object",
+        description: "Upload files to a cloud storage bucket.",
+      },
+      {
+        digest: "train",
+        name: "train_model",
+        description: "Train a model on tabular data.",
+      },
+      {
+        digest: "predict",
+        name: "predict_labels",
+        description: "Predict labels for examples.",
+      },
+      {
+        digest: "table",
+        name: "clean_table",
+        description: "Clean tabular dataframe rows.",
+      },
+    ];
+
+    const index = buildSearchIndex(
+      fixtures.map(({ digest, name, description }) =>
+        makeSourced({
+          digest,
+          spec: {
+            name,
+            description,
+            inputs: [],
+            outputs: [],
+            implementation: { container: { image: "x" } },
+          },
+        }),
+      ),
+    );
+
+    // Each query is an alias that does not occur in its target's own name or
+    // description, so the match has to come through a synonym.
+    const topHit = (query: string) => lexicalSearch(index, query)[0]?.digest;
+    expect(topHit("gcs")).toBe("storage");
+    expect(topHit("fit")).toBe("train");
+    expect(topHit("infer")).toBe("predict");
+    expect(topHit("df")).toBe("table");
+  });
+
   it("ignores natural-language filler words that would otherwise swamp intent", () => {
     const index = buildSearchIndex([
       makeSourced({
diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts
index 2ce262b07..c78052db1 100644
--- a/src/services/componentSearchIndex.ts
+++ b/src/services/componentSearchIndex.ts
@@ -15,6 +15,8 @@
 import type { ComponentReference } from "@/utils/componentSpec";
 import { getComponentName } from "@/utils/getComponentName";
 
+import { expandSynonymTokens } from "./componentSearchSynonyms";
+
 /** Which field of a component matched the query. Surfaced in the UI. */
 export type MatchField =
   | "name"
@@ -158,7 +160,7 @@ function stemToken(token: string): string {
   return token;
 }
 
-function normalizeSearchText(text: string): string {
+function baseSearchTokens(text: string): string[] {
   const splitText = splitIdentifierText(text).toLowerCase();
   const tokens = splitText.split(/[^a-z0-9]+/).filter(isNonEmptyString);
   const expandedTokens: string[] = [];
@@ -172,7 +174,17 @@ function normalizeSearchText(text: string): string {
     }
   }
 
-  return [text.toLowerCase(), splitText, expandedTokens.join(" ")].join(" ");
+  return expandedTokens;
+}
+
+function normalizeSearchText(text: string): string {
+  // Synonyms are expanded on the query side only (see `lexicalSearch`). Doing
+  // it for indexed text too would balloon the index token set and surface
+  // components that match neither the literal text nor the intent.
+  const lowered = text.toLowerCase();
+  const identifierSplit = splitIdentifierText(text).toLowerCase();
+  const expanded = baseSearchTokens(text).join(" ");
+  return `${lowered} ${identifierSplit} ${expanded}`;
+}
 
 function extractAnnotationsText(
@@ -347,16 +359,12 @@ const QUERY_STOP_WORDS = new Set([
 ]);
 
 /**
- * Split a query into meaningful lowercase alphanumeric tokens. Natural-language
- * searches often include filler words ("I want to upload a component to GCS").
- * Dropping those words prevents common tokens like "a"/"to" from matching
- * nearly every component and drowning out the useful intent terms.
+ * Drop filler words and de-duplicate query tokens. Natural-language searches
+ * often include filler ("I want to upload a component to GCS"); removing those
+ * prevents common tokens like "a"/"to" from matching nearly every component and
+ * drowning out the useful intent terms.
  */
-function tokenize(text: string): string[] {
-  const rawTokens = normalizeSearchText(text)
-    .split(/[^a-z0-9]+/)
-    .filter(isNonEmptyString);
-
+function filterQueryTokens(rawTokens: string[]): string[] {
   const tokens: string[] = [];
   const seen = new Set<string>();
   for (const token of rawTokens) {
@@ -416,6 +424,7 @@ interface SearchOptions {
 function scoreEntry(
   entry: IndexEntry,
   tokens: string[],
+  phraseTokens: string[],
 ): { score: number; matchedFields: MatchField[] } {
   const matched = new Set<MatchField>();
   let score = 0;
@@ -433,9 +442,9 @@ function scoreEntry(
   // sides are normalized so the bonus also fires for snake_case names —
   // query "train test split" should match `train_test_split`, not just
   // names that happen to contain literal spaces.
-  if (tokens.length > 1) {
+  if (phraseTokens.length > 1) {
     const normalizedName = entry.searchable.name.replace(/[^a-z0-9]+/g, " ");
-    const normalizedQuery = tokens.join(" ");
+    const normalizedQuery = phraseTokens.join(" ");
     if (normalizedName.includes(normalizedQuery)) {
       score += 10;
       matched.add("name");
@@ -459,12 +468,16 @@ export function lexicalSearch(
   const trimmed = query.trim().toLowerCase();
   if (trimmed.length < minLength) return [];
 
-  const tokens = tokenize(trimmed);
+  const baseTokens = baseSearchTokens(trimmed);
+  const tokens = filterQueryTokens(expandSynonymTokens(baseTokens));
   if (tokens.length === 0) return [];
+  const phraseTokens = baseTokens.filter(
+    (token) => !QUERY_STOP_WORDS.has(token),
+  );
 
   const scored: Array<LexicalMatch & { score: number }> = [];
   for (const entry of index) {
-    const { score, matchedFields } = scoreEntry(entry, tokens);
+    const { score, matchedFields } = scoreEntry(entry, tokens, phraseTokens);
     if (score === 0) continue;
     scored.push({
       reference: entry.reference,
diff --git a/src/services/componentSearchSynonyms.ts b/src/services/componentSearchSynonyms.ts
new file mode 100644
index 000000000..fc373ac24
--- /dev/null
+++ b/src/services/componentSearchSynonyms.ts
@@ -0,0 +1,41 @@
+// Groups of search terms treated as interchangeable. A term should appear in
+// at most one group: the lookup below keeps only the last group seen for it.
+const SYNONYM_GROUPS = [
+  ["gcs", "storage", "bucket", "object storage", "cloud storage"],
+  ["train", "fit", "training", "trainer"],
+  ["predict", "infer", "inference", "score"],
+  ["df", "dataframe", "data frame", "table"],
+  ["csv", "comma separated", "tabular"],
+  ["embed", "embedding", "vectorize", "vector"],
+  ["llm", "language model", "chat model"],
+] as const;
+
+const SYNONYM_TOKENS_BY_TOKEN = new Map<string, string[]>();
+
+// Multi-word aliases ("data frame") are excluded from the lookup, both as keys
+// and as expansions. Queries are matched token-by-token, so a phrase can never
+// match as a unit, and fragmenting it would turn common words ("data",
+// "model") into keys that cross-link unrelated components.
+for (const group of SYNONYM_GROUPS) {
+  const terms = group.filter((term) => /^[a-z0-9]+$/.test(term));
+  // Each member of a group maps to the same list, which includes itself.
+  terms.forEach((term) => SYNONYM_TOKENS_BY_TOKEN.set(term, terms));
+}
+
+/**
+ * Expands each query token with the synonyms registered for it.
+ *
+ * Every token is emitted before its own synonyms, input order is kept, and a
+ * value that was already emitted is skipped.
+ *
+ * @param tokens Tokens to expand; the array is not modified.
+ * @returns The tokens and their synonyms, de-duplicated in first-seen order.
+ */
+export function expandSynonymTokens(tokens: string[]): string[] {
+  const withSynonyms = tokens.flatMap((token) => [
+    token,
+    ...(SYNONYM_TOKENS_BY_TOKEN.get(token) ?? []),
+  ]);
+  // A Set iterates in insertion order, so first-seen ordering is preserved.
+  return [...new Set(withSynonyms)];
+}