TangleML · Mbeaulne · Jun 18, 2026
@@ -291,6 +291,111 @@ describe("lexicalSearch", () => {
     expect(lexicalSearch(index, "batch")[0]?.digest).toBe("normalize");
   });
 
+  it("boosts prefix matches for partial search terms", () => {
+    // Both components match "classif" in the same (name) field, so the only
+    // thing that can separate them is the prefix bonus. The substring candidate
+    // also sorts first alphabetically, so without the bonus it would win the
+    // tie-break — making this test fail if PREFIX_MATCH_BONUS_MULTIPLIER were 0.
+    const index = buildSearchIndex([
+      makeSourced({
+        digest: "prefix",
+        spec: {
+          name: "classify_rows", // the query "classif" is a true word-prefix of "classify"
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "substring",
+        spec: {
+          // "reclassify" contains "classif" mid-word (never as a prefix), and
+          // "alpha_…" sorts before "classify_rows".
+          name: "alpha_reclassify",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+    ]);
+
+    expect(lexicalSearch(index, "classif")[0]?.digest).toBe("prefix");
+  });
+
+  it("boosts rare tokens over common tokens", () => {
+    // Both candidates contain BOTH query tokens (so the all-tokens bonus applies
+    // equally) and differ only in which token sits in the high-weight name
+    // field. The filler components make "model" common and "xgboost" rare, so
+    // only the IDF weighting can make the rare-token-in-name candidate win — the
+    // test fails if rare-token weighting is removed (then it ties and the
+    // alphabetically-earlier "common-in-name" wins instead).
+    const filler = ["m1", "m2", "m3"].map((digest) =>
+      makeSourced({
+        digest,
+        spec: {
+          name: `${digest}_model`, // every filler name contains "model"; none contains "xgboost"
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+    );
+    const index = buildSearchIndex([
+      ...filler,
+      makeSourced({
+        digest: "common-in-name",
+        spec: {
+          name: "train_model", // common token in the name, rare token only in the description
+          description: "Uses xgboost.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "rare-in-name",
+        spec: {
+          name: "xgboost_runner", // rare token in the name, common token only in the description
+          description: "Builds a model.",
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+    ]);
+
+    expect(lexicalSearch(index, "model xgboost")[0]?.digest).toBe(
+      "rare-in-name",
+    );
+  });
+
+  it("boosts matches that include every query token across fields", () => {
+    const index = buildSearchIndex([
+      makeSourced({
+        digest: "partial",
+        spec: {
+          name: "a_train_task", // "a_…" sorts before "z_…"; presumably this entry wins a pure score tie — confirm tie-break order
+          description: "Train something.", // contains "train" (as does the name) but never "model"
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+      makeSourced({
+        digest: "complete",
+        spec: {
+          name: "z_train_task",
+          description: "Produces a model artifact.", // supplies "model", so name + description together cover both query tokens
+          inputs: [],
+          outputs: [],
+          implementation: { container: { image: "x" } },
+        },
+      }),
+    ]);
+
+    expect(lexicalSearch(index, "train model")[0]?.digest).toBe("complete"); // NOTE(review): "model" matches 1 entry vs 2 for "train", so rare-token weighting alone may already rank "complete" first — confirm this fails with ALL_QUERY_TOKENS_BONUS = 0
+  });
+
   it("expands domain-neutral synonyms", () => {
     const index = buildSearchIndex([
       makeSourced({

@@ -364,18 +364,28 @@ const QUERY_STOP_WORDS = new Set([
  * prevents common tokens like "a"/"to" from matching nearly every component and
  * drowning out the useful intent terms.
  */
-function filterQueryTokens(rawTokens: string[]): string[] {
-  const tokens: string[] = [];
+function uniqueTokens(tokens: string[]): string[] { // despite the name, this also removes QUERY_STOP_WORDS
+  const unique: string[] = [];
   const seen = new Set<string>();
-  for (const token of rawTokens) {
-    if (QUERY_STOP_WORDS.has(token)) continue;
-    if (!seen.has(token)) {
-      tokens.push(token);
-      seen.add(token);
-    }
+  for (const token of tokens) {
+    if (QUERY_STOP_WORDS.has(token) || seen.has(token)) continue; // skip stop words and tokens already emitted
+    seen.add(token);
+    unique.push(token);
   }
+  return unique; // survivors in first-occurrence order
+}
 
-  return tokens;
+function tokenize(text: string): string[] {
+  return uniqueTokens(expandSynonymTokens(baseSearchTokens(text))); // base tokens → synonym expansion → stop-word filter + dedupe
+}
+
+function requiredQueryTokens(text: string): string[] { // the user's own terms: unlike tokenize(), no synonym expansion is applied
+  const rawTokens = splitIdentifierText(text) // NOTE(review): this pipeline may duplicate baseSearchTokens (not visible here) — confirm and reuse if so
+    .toLowerCase()
+    .split(/[^a-z0-9]+/) // break on any run of characters outside a-z / 0-9
+    .filter(isNonEmptyString)
+    .map(stemToken);
+  return uniqueTokens(rawTokens); // drop stop words and duplicates, keeping first-seen order
 }
 
 /**
@@ -391,6 +401,17 @@ const FIELD_WEIGHTS: Record<MatchField, number> = {
   metadata: 1,
 };
 
+const FIELD_PHRASE_BONUS: Record<MatchField, number> = { // flat bonus per field whose normalized text contains the whole multi-token query phrase
+  name: 10, // same value as the name-only phrase bonus this table replaces
+  description: 4,
+  io: 4,
+  implementation: 2,
+  metadata: 2,
+};
+
+const PREFIX_MATCH_BONUS_MULTIPLIER = 0.5; // a prefix hit adds this fraction of (field weight × token weight) on top of the base match
+const ALL_QUERY_TOKENS_BONUS = 6; // added once per entry when the query has 2+ required tokens and each one matches some field
+
 const SEARCH_FIELDS: MatchField[] = [
   "name",
   "description",
@@ -409,48 +430,104 @@ interface SearchOptions {
   minLength?: number;
 }
 
+function searchableTokens(text: string): string[] {
+  return text.split(/[^a-z0-9]+/).filter(isNonEmptyString); // split on non-alphanumeric runs; assumes text is already lowercased — TODO confirm
+}
+
+function entryMatchesToken(entry: IndexEntry, token: string): boolean {
+  return SEARCH_FIELDS.some((field) => entry.searchable[field].includes(token)); // substring test in any search field, not a whole-word match
+}
+
+function buildRareTokenWeights(
+  index: IndexEntry[],
+  tokens: string[],
+): Map<string, number> { // token → score multiplier ≥ 1; tokens matching fewer entries get larger values
+  const weights = new Map<string, number>();
+  for (const token of tokens) {
+    const documentFrequency = index.filter((entry) => // number of entries containing the token in any search field
+      entryMatchesToken(entry, token),
+    ).length;
+    const inverseFrequency = Math.log( // ln((N + 1) / (df + 1)); the +1 avoids dividing by zero when nothing matches
+      (index.length + 1) / (documentFrequency + 1),
+    );
+    weights.set(token, 1 + Math.max(0, inverseFrequency)); // a token matching every entry gets weight 1 and still contributes
+  }
+  return weights;
+}
+
 /**
  * Score one entry against the tokenized query. Returns 0 if no field matched.
  *
  * Scoring model:
  * - Per query token: each field that contains the token contributes its weight.
- * - Bonus: full multi-token query as a substring of the name (+10). Catches
- *   "train test split" matching `train_test_split` strongly even though we
- *   tokenized.
+ * - Word-boundary matches (a query token that begins an indexed token,
+ *   including exact matches) get a small extra boost, useful for partial names.
+ * - Rare query tokens count more than tokens that match many components.
+ * - Contiguous multi-token phrase matches (per field) and all-token matches get bonuses.
  *
  * Indexed text and query text are normalized before scoring; raw scores are
  * only used for ordering.
  */
 function scoreEntry(
   entry: IndexEntry,
   tokens: string[],
-  phraseTokens: string[],
+  requiredTokens: string[],
+  tokenWeights: Map<string, number>,
 ): { score: number; matchedFields: MatchField[] } {
   const matched = new Set<MatchField>();
   let score = 0;
 
+  // A field's tokenization depends only on the field, not the query token, so
+  // split it lazily, at most once per scoreEntry call, rather than re-splitting
+  // inside the per-query-token loop — that re-split is hot on every keystroke.
+  const fieldTokenCache = new Map<MatchField, string[]>();
+  const fieldTokensFor = (field: MatchField): string[] => {
+    const cached = fieldTokenCache.get(field);
+    if (cached) return cached; // an empty array is still truthy, so token-less fields are cached too
+    const fieldTokens = searchableTokens(entry.searchable[field]);
+    fieldTokenCache.set(field, fieldTokens);
+    return fieldTokens;
+  };
+
   for (const token of tokens) {
+    const tokenWeight = tokenWeights.get(token) ?? 1; // tokens without a precomputed weight count as 1
     for (const field of SEARCH_FIELDS) {
-      if (entry.searchable[field].includes(token)) {
-        score += FIELD_WEIGHTS[field];
-        matched.add(field);
+      const fieldText = entry.searchable[field];
+      if (!fieldText.includes(token)) continue; // base match is a plain substring test
+
+      const fieldWeight = FIELD_WEIGHTS[field];
+      score += fieldWeight * tokenWeight;
+      matched.add(field);
+
+      const hasPrefixMatch = fieldTokensFor(field).some((fieldToken) => // true when some word of the field starts with the token
+        fieldToken.startsWith(token),
+      );
+      if (hasPrefixMatch) {
+        score += fieldWeight * PREFIX_MATCH_BONUS_MULTIPLIER * tokenWeight;
       }
     }
   }
 
-  // Multi-token contiguous match in the name is a very strong signal. Both
-  // sides are normalized so the bonus also fires for snake_case names —
-  // query "train test split" should match `train_test_split`, not just
-  // names that happen to contain literal spaces.
-  if (phraseTokens.length > 1) {
-    const normalizedName = entry.searchable.name.replace(/[^a-z0-9]+/g, " ");
-    const normalizedQuery = phraseTokens.join(" ");
-    if (normalizedName.includes(normalizedQuery)) {
-      score += 10;
-      matched.add("name");
+  if (requiredTokens.length > 1) { // phrase bonus: the space-joined query must appear contiguously in the normalized field
+    const normalizedQuery = requiredTokens.join(" ");
+    for (const field of SEARCH_FIELDS) {
+      const normalizedField = entry.searchable[field].replace( // collapse separators to single spaces so snake_case text can match
+        /[^a-z0-9]+/g,
+        " ",
+      );
+      if (!normalizedField.includes(normalizedQuery)) continue;
+      score += FIELD_PHRASE_BONUS[field];
+      matched.add(field);
     }
   }
 
+  if (
+    requiredTokens.length > 1 &&
+    requiredTokens.every((token) => entryMatchesToken(entry, token))
+  ) {
+    score += ALL_QUERY_TOKENS_BONUS; // each required token may be satisfied by a different field
+  }
+
   return { score, matchedFields: [...matched] };
 }
 
@@ -468,16 +545,19 @@ export function lexicalSearch(
   const trimmed = query.trim().toLowerCase();
   if (trimmed.length < minLength) return [];
 
-  const baseTokens = baseSearchTokens(trimmed);
-  const tokens = filterQueryTokens(expandSynonymTokens(baseTokens));
+  const tokens = tokenize(trimmed);
   if (tokens.length === 0) return [];
-  const phraseTokens = baseTokens.filter(
-    (token) => !QUERY_STOP_WORDS.has(token),
-  );
+  const requiredTokens = requiredQueryTokens(trimmed);
+  const tokenWeights = buildRareTokenWeights(index, tokens);
 
   const scored: Array<LexicalMatch & { score: number }> = [];
   for (const entry of index) {
-    const { score, matchedFields } = scoreEntry(entry, tokens, phraseTokens);
+    const { score, matchedFields } = scoreEntry(
+      entry,
+      tokens,
+      requiredTokens,
+      tokenWeights,
+    );
     if (score === 0) continue;
     scored.push({
       reference: entry.reference,