Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions src/services/componentSearchIndex.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,56 @@ describe("lexicalSearch", () => {
expect(lexicalSearch(index, "batch")[0]?.digest).toBe("normalize");
});

it("expands domain-neutral synonyms", () => {
  // Every fixture is the same minimal container component; only the digest,
  // name and description differ, so build them through one local factory.
  const component = (digest: string, name: string, description: string) =>
    makeSourced({
      digest,
      spec: {
        name,
        description,
        inputs: [],
        outputs: [],
        implementation: { container: { image: "x" } },
      },
    });

  const index = buildSearchIndex([
    component(
      "storage",
      "upload_object",
      "Upload files to a cloud storage bucket.",
    ),
    component("train", "train_model", "Train a model on tabular data."),
    component("predict", "predict_labels", "Predict labels for examples."),
    component("table", "clean_table", "Clean tabular dataframe rows."),
  ]);

  // None of these query words appears literally in the component it should
  // surface; each one has to be reached through a synonym.
  expect(lexicalSearch(index, "gcs")[0]?.digest).toBe("storage");
  expect(lexicalSearch(index, "fit")[0]?.digest).toBe("train");
  expect(lexicalSearch(index, "infer")[0]?.digest).toBe("predict");
  expect(lexicalSearch(index, "df")[0]?.digest).toBe("table");
});

it("ignores natural-language filler words that would otherwise swamp intent", () => {
const index = buildSearchIndex([
makeSourced({
Expand Down
43 changes: 28 additions & 15 deletions src/services/componentSearchIndex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import type { ComponentReference } from "@/utils/componentSpec";
import { getComponentName } from "@/utils/getComponentName";

import { expandSynonymTokens } from "./componentSearchSynonyms";

/** Which field of a component matched the query. Surfaced in the UI. */
export type MatchField =
| "name"
Expand Down Expand Up @@ -158,7 +160,7 @@ function stemToken(token: string): string {
return token;
}

function normalizeSearchText(text: string): string {
function baseSearchTokens(text: string): string[] {
const splitText = splitIdentifierText(text).toLowerCase();
const tokens = splitText.split(/[^a-z0-9]+/).filter(isNonEmptyString);
const expandedTokens: string[] = [];
Expand All @@ -172,7 +174,17 @@ function normalizeSearchText(text: string): string {
}
}

return [text.toLowerCase(), splitText, expandedTokens.join(" ")].join(" ");
return expandedTokens;
}

/**
 * Build the searchable form of a piece of text: the lowercased original, the
 * lowercased identifier-split form, and the base search tokens, all joined by
 * single spaces.
 *
 * Synonyms are deliberately NOT added here. Expansion is applied to the query
 * only (see `lexicalSearch`); expanding the indexed text as well would let an
 * expanded query intersect an equally expanded index and surface components
 * that match neither the literal text nor the intent.
 */
function normalizeSearchText(text: string): string {
  const identifierSplit = splitIdentifierText(text).toLowerCase();
  const lowered = text.toLowerCase();
  const baseTokens = baseSearchTokens(text).join(" ");
  return `${lowered} ${identifierSplit} ${baseTokens}`;
}

function extractAnnotationsText(
Expand Down Expand Up @@ -347,16 +359,12 @@ const QUERY_STOP_WORDS = new Set([
]);

/**
* Split a query into meaningful lowercase alphanumeric tokens. Natural-language
* searches often include filler words ("I want to upload a component to GCS").
* Dropping those words prevents common tokens like "a"/"to" from matching
* nearly every component and drowning out the useful intent terms.
* Drop filler words and de-duplicate query tokens. Natural-language searches
* often include filler ("I want to upload a component to GCS"); removing those
* prevents common tokens like "a"/"to" from matching nearly every component and
* drowning out the useful intent terms.
*/
function tokenize(text: string): string[] {
const rawTokens = normalizeSearchText(text)
.split(/[^a-z0-9]+/)
.filter(isNonEmptyString);

function filterQueryTokens(rawTokens: string[]): string[] {
const tokens: string[] = [];
const seen = new Set<string>();
for (const token of rawTokens) {
Expand Down Expand Up @@ -416,6 +424,7 @@ interface SearchOptions {
function scoreEntry(
entry: IndexEntry,
tokens: string[],
phraseTokens: string[],
): { score: number; matchedFields: MatchField[] } {
const matched = new Set<MatchField>();
let score = 0;
Expand All @@ -433,9 +442,9 @@ function scoreEntry(
// sides are normalized so the bonus also fires for snake_case names —
// query "train test split" should match `train_test_split`, not just
// names that happen to contain literal spaces.
if (tokens.length > 1) {
if (phraseTokens.length > 1) {
Comment thread
Mbeaulne marked this conversation as resolved.
const normalizedName = entry.searchable.name.replace(/[^a-z0-9]+/g, " ");
const normalizedQuery = tokens.join(" ");
const normalizedQuery = phraseTokens.join(" ");
if (normalizedName.includes(normalizedQuery)) {
score += 10;
matched.add("name");
Expand All @@ -459,12 +468,16 @@ export function lexicalSearch(
const trimmed = query.trim().toLowerCase();
if (trimmed.length < minLength) return [];

const tokens = tokenize(trimmed);
const baseTokens = baseSearchTokens(trimmed);
const tokens = filterQueryTokens(expandSynonymTokens(baseTokens));
if (tokens.length === 0) return [];
const phraseTokens = baseTokens.filter(
(token) => !QUERY_STOP_WORDS.has(token),
);

const scored: Array<LexicalMatch & { score: number }> = [];
for (const entry of index) {
const { score, matchedFields } = scoreEntry(entry, tokens);
const { score, matchedFields } = scoreEntry(entry, tokens, phraseTokens);
if (score === 0) continue;
scored.push({
reference: entry.reference,
Expand Down
41 changes: 41 additions & 0 deletions src/services/componentSearchSynonyms.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/**
 * Groups of terms treated as interchangeable when a search query is expanded.
 *
 * Multi-word entries ("data frame", "cloud storage") are listed for
 * completeness but do not take part in expansion; see `buildSynonymIndex`.
 */
const SYNONYM_GROUPS = [
  ["gcs", "storage", "bucket", "object storage", "cloud storage"],
  ["train", "fit", "training", "trainer"],
  ["predict", "infer", "inference", "score"],
  ["df", "dataframe", "data frame", "table"],
  ["csv", "comma separated", "tabular"],
  ["embed", "embedding", "vectorize", "vector"],
  ["llm", "language model", "chat model"],
] as const;

/** Matches a term that is exactly one lowercase alphanumeric token. */
const SINGLE_TOKEN_PATTERN = /^[a-z0-9]+$/;

/**
 * Build the token -> aliases lookup used for query expansion.
 *
 * Only single-word aliases are usable as expansion keys: queries are matched
 * token-by-token, so a multi-word phrase ("data frame") can never match as a
 * unit, and fragmenting it would turn common words ("data", "model") into
 * keys that cross-link unrelated components.
 *
 * A term that appears in more than one group receives the union of the
 * single-word terms of every group that lists it, instead of keeping only the
 * last group. The merge is per term and not transitive: the other members of
 * those groups are not linked to each other.
 *
 * @param groups Synonym groups; each inner list is one set of aliases.
 * @returns A map from each single-word term to its aliases (the term itself
 *   included), in first-seen order. Every key owns its own array.
 */
function buildSynonymIndex(
  groups: readonly (readonly string[])[],
): Map<string, string[]> {
  const index = new Map<string, string[]>();

  for (const group of groups) {
    const singleWordTerms = group.filter((term) =>
      SINGLE_TOKEN_PATTERN.test(term),
    );

    for (const token of singleWordTerms) {
      const aliases = index.get(token);
      if (aliases === undefined) {
        // Copy so that later merges for one token never leak into another's list.
        index.set(token, [...singleWordTerms]);
        continue;
      }
      for (const term of singleWordTerms) {
        if (!aliases.includes(term)) aliases.push(term);
      }
    }
  }

  return index;
}

const SYNONYM_TOKENS_BY_TOKEN = buildSynonymIndex(SYNONYM_GROUPS);

/**
 * Expand query tokens with their single-word synonyms.
 *
 * Each input token is emitted first, followed by any aliases registered for
 * it. Tokens without a synonym group pass through unchanged. The result is
 * de-duplicated and keeps first-seen order. The input array is not modified.
 *
 * @param tokens Query tokens, expected to be lowercase alphanumeric.
 * @returns A new array holding the tokens plus their synonyms.
 */
export function expandSynonymTokens(tokens: readonly string[]): string[] {
  // A Set iterates in insertion order, so it de-duplicates and orders at once.
  const expanded = new Set<string>();

  for (const token of tokens) {
    expanded.add(token);
    for (const alias of SYNONYM_TOKENS_BY_TOKEN.get(token) ?? []) {
      expanded.add(alias);
    }
  }

  return [...expanded];
}
Loading