From e7b76a8cddd99d6db8e5183ef60b9c9f8943c235 Mon Sep 17 00:00:00 2001
From: mbeaulne <matt.beaulne@gmail.com>
Date: Thu, 18 Jun 2026 12:55:10 -0400
Subject: [PATCH 1/2] Normalize component search tokens for better matching

---
 src/services/componentSearchIndex.test.ts | 57 +++++++++++++--
 src/services/componentSearchIndex.ts      | 85 +++++++++++++++++++----
 2 files changed, 125 insertions(+), 17 deletions(-)

diff --git a/src/services/componentSearchIndex.test.ts b/src/services/componentSearchIndex.test.ts
index 72a6974f8..65c22eaab 100644
--- a/src/services/componentSearchIndex.test.ts
+++ b/src/services/componentSearchIndex.test.ts
@@ -236,10 +236,59 @@ describe("lexicalSearch", () => {
     expect(results[0]?.digest).toBe("contiguous");
   });
 
-  it("tokenizes snake_case queries so each segment matches independently", () => {
-    const index = buildSearchIndex(fixtures);
-    const results = lexicalSearch(index, "drop nulls");
-    expect(results[0]?.digest).toBe("b");
+  it("normalizes snake_case, kebab-case, and camelCase text", () => {
+    const index = buildSearchIndex([
+      makeSourced({
+        digest: "snake",
+        spec: {
+          name: "drop_nulls",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "kebab",
+        spec: {
+          name: "train-model",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "camel",
+        spec: {
+          name: "loadCSVFile",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+    ]);
+
+    expect(lexicalSearch(index, "drop nulls")[0]?.digest).toBe("snake");
+    expect(lexicalSearch(index, "train model")[0]?.digest).toBe("kebab");
+    expect(lexicalSearch(index, "load csv file")[0]?.digest).toBe("camel");
+  });
+
+  it("normalizes plurals and simple stemmed terms", () => {
+    const index = buildSearchIndex([
+      makeSourced({
+        digest: "normalize",
+        spec: {
+          name: "train_model",
+          description: "Train models on a dataset with labeled batches.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+    ]);
+
+    expect(lexicalSearch(index, "training")[0]?.digest).toBe("normalize");
+    expect(lexicalSearch(index, "datasets")[0]?.digest).toBe("normalize");
+    expect(lexicalSearch(index, "batch")[0]?.digest).toBe("normalize");
   });
 
   it("ignores natural-language filler words that would otherwise swamp intent", () => {
diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts
index ac7be68ed..2f21868cf 100644
--- a/src/services/componentSearchIndex.ts
+++ b/src/services/componentSearchIndex.ts
@@ -54,7 +54,7 @@ export interface IndexEntry {
   name: string;
   /** Where this component came from. */
   source: ComponentSearchSource;
-  /** Pre-lowercased searchable text, one per logical field. */
+  /** Normalized searchable text, one per logical field. */
   searchable: Record<MatchField, string>;
 }
 
@@ -113,6 +113,62 @@ function stringifySearchValue(value: unknown): string {
   }
 }
 
+function splitIdentifierText(text: string): string {
+  return text
+    .replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2")
+    .replace(/([a-z0-9])([A-Z])/g, "$1 $2")
+    .replace(/[_-]+/g, " ");
+}
+
+function removeSuffixAndCollapseDoubleFinal(
+  token: string,
+  suffixLength: number,
+): string {
+  const stemmed = token.slice(0, -suffixLength);
+  if (stemmed.length < 3) return stemmed;
+
+  const last = stemmed.at(-1);
+  const previous = stemmed.at(-2);
+  return last && last === previous ? stemmed.slice(0, -1) : stemmed;
+}
+
+function stemToken(token: string): string {
+  if (token.length <= 3) return token;
+  if (token.endsWith("ies") && token.length > 4) {
+    return `${token.slice(0, -3)}y`;
+  }
+  if (token.endsWith("ing") && token.length > 5) {
+    return removeSuffixAndCollapseDoubleFinal(token, 3);
+  }
+  if (token.endsWith("ed") && token.length > 4) {
+    return removeSuffixAndCollapseDoubleFinal(token, 2);
+  }
+  if (/(ches|shes|xes|zes|ses)$/.test(token) && token.length > 4) {
+    return token.slice(0, -2);
+  }
+  if (token.endsWith("s") && !token.endsWith("ss") && token.length > 3) {
+    return token.slice(0, -1);
+  }
+  return token;
+}
+
+function normalizeSearchText(text: string): string {
+  const splitText = splitIdentifierText(text).toLowerCase();
+  const tokens = splitText.split(/[^a-z0-9]+/).filter(isNonEmptyString);
+  const expandedTokens: string[] = [];
+  const seen = new Set<string>();
+
+  for (const token of tokens) {
+    for (const variant of [token, stemToken(token)]) {
+      if (seen.has(variant)) continue;
+      seen.add(variant);
+      expandedTokens.push(variant);
+    }
+  }
+
+  return [text.toLowerCase(), splitText, expandedTokens.join(" ")].join(" ");
+}
+
 function extractAnnotationsText(
   annotations: Record<string, unknown> | undefined,
 ): string {
@@ -243,14 +299,17 @@ export function buildSearchIndex(sourced: SourcedReference[]): IndexEntry[] {
       name: metadata.name,
       source,
       searchable: {
-        name: metadata.name.toLowerCase(),
-        description: metadata.description.toLowerCase(),
-        io: metadata.ioText.toLowerCase(),
-        implementation: extractImplementationText(reference),
-        metadata: [metadata.metadataText, source.label, reference.published_by]
-          .filter(isNonEmptyString)
-          .join(" ")
-          .toLowerCase(),
+        name: normalizeSearchText(metadata.name),
+        description: normalizeSearchText(metadata.description),
+        io: normalizeSearchText(metadata.ioText),
+        implementation: normalizeSearchText(
+          extractImplementationText(reference),
+        ),
+        metadata: normalizeSearchText(
+          [metadata.metadataText, source.label, reference.published_by]
+            .filter(isNonEmptyString)
+            .join(" "),
+        ),
       },
     });
   }
@@ -288,10 +347,9 @@ const QUERY_STOP_WORDS = new Set([
  * nearly every component and drowning out the useful intent terms.
  */
 function tokenize(text: string): string[] {
-  const rawTokens = text
-    .toLowerCase()
+  const rawTokens = normalizeSearchText(text)
     .split(/[^a-z0-9]+/)
-    .filter((t) => t.length > 0);
+    .filter(isNonEmptyString);
 
   const tokens: string[] = [];
   const seen = new Set<string>();
@@ -346,7 +404,8 @@ interface SearchOptions {
  *   "train test split" matching `train_test_split` strongly even though we
  *   tokenized.
  *
- * We deliberately do not normalize — raw scores are only used for ordering.
+ * Indexed and query text are normalized before scoring; the scores themselves
+ * are deliberately not normalized — raw scores are only used for ordering.
  */
 function scoreEntry(
   entry: IndexEntry,

From 0494d71681fed0e0d7889d0f44fce6bca1781f94 Mon Sep 17 00:00:00 2001
From: mbeaulne <matt.beaulne@gmail.com>
Date: Thu, 18 Jun 2026 15:08:35 -0400
Subject: [PATCH 2/2] Address PR feedback on search token normalization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- splitIdentifierText: anchor first capital group to a single char to remove
  O(n²) regex backtracking on long uppercase runs (behavior-preserving)
- stemToken: guard -is/-us endings so status/analysis/axis aren't over-stemmed

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/services/componentSearchIndex.ts | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts
index 2f21868cf..2ce262b07 100644
--- a/src/services/componentSearchIndex.ts
+++ b/src/services/componentSearchIndex.ts
@@ -115,7 +115,7 @@ function stringifySearchValue(value: unknown): string {
 
 function splitIdentifierText(text: string): string {
   return text
-    .replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2")
+    .replace(/([A-Z])([A-Z][a-z])/g, "$1 $2")
     .replace(/([a-z0-9])([A-Z])/g, "$1 $2")
     .replace(/[_-]+/g, " ");
 }
@@ -146,7 +146,13 @@ function stemToken(token: string): string {
   if (/(ches|shes|xes|zes|ses)$/.test(token) && token.length > 4) {
     return token.slice(0, -2);
   }
-  if (token.endsWith("s") && !token.endsWith("ss") && token.length > 3) {
+  if (
+    token.endsWith("s") &&
+    !token.endsWith("ss") &&
+    !token.endsWith("is") &&
+    !token.endsWith("us") &&
+    token.length > 3
+  ) {
     return token.slice(0, -1);
   }
   return token;