Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions src/services/componentSearchIndex.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,111 @@ describe("lexicalSearch", () => {
expect(lexicalSearch(index, "batch")[0]?.digest).toBe("normalize");
});

it("boosts prefix matches for partial search terms", () => {
// Both components match "classif" in the same (name) field, so the only
// thing that can separate them is the prefix bonus. The substring candidate
// also sorts first alphabetically, so without the bonus it would win the
// tie-break — making this test fail if PREFIX_MATCH_BONUS_MULTIPLIER were 0.
const index = buildSearchIndex([
makeSourced({
digest: "prefix",
spec: {
name: "classify_rows", // the query "classif" is a true word-prefix of "classify"
inputs: [],
outputs: [],
implementation: { container: { image: "x" } },
},
}),
makeSourced({
digest: "substring",
spec: {
// "reclassify" contains "classif" mid-word (never as a prefix), and
// "alpha_…" sorts before "classify_rows".
name: "alpha_reclassify",
inputs: [],
outputs: [],
implementation: { container: { image: "x" } },
},
}),
]);

expect(lexicalSearch(index, "classif")[0]?.digest).toBe("prefix");
});

it("boosts rare tokens over common tokens", () => {
  // Three filler components carry "model" in their names, which makes "model"
  // a common token across the index while "xgboost" stays rare.
  const commonTokenFillers = Array.from(["m1", "m2", "m3"], (digest) =>
    makeSourced({
      digest,
      spec: {
        name: `${digest}_model`,
        inputs: [],
        outputs: [],
        implementation: { container: { image: "x" } },
      },
    }),
  );

  // Both candidates contain both query tokens, so the all-tokens bonus applies
  // to each of them. They differ only in which token lives in the name: the
  // common one ("model") here...
  const commonInName = makeSourced({
    digest: "common-in-name",
    spec: {
      name: "train_model",
      description: "Uses xgboost.",
      inputs: [],
      outputs: [],
      implementation: { container: { image: "x" } },
    },
  });

  // ...and the rare one ("xgboost") here. Only the rare-token weighting can
  // lift this candidate above the alphabetically earlier "train_model".
  const rareInName = makeSourced({
    digest: "rare-in-name",
    spec: {
      name: "xgboost_runner",
      description: "Builds a model.",
      inputs: [],
      outputs: [],
      implementation: { container: { image: "x" } },
    },
  });

  const index = buildSearchIndex([
    ...commonTokenFillers,
    commonInName,
    rareInName,
  ]);

  const [topMatch] = lexicalSearch(index, "model xgboost");
  expect(topMatch?.digest).toBe("rare-in-name");
});

it("boosts matches that include every query token across fields", () => {
// "partial" only ever matches "train" (in both its name and its description);
// "complete" matches "train" in its name and "model" in its description, so
// it is the only entry that contains every query token. The "a_"/"z_" name
// prefixes make "partial" the alphabetically earlier of the two.
// NOTE(review): in this two-entry index "model" matches one entry while
// "train" matches both, so the rare-token weighting may already rank
// "complete" first on its own — confirm this test still fails when
// ALL_QUERY_TOKENS_BONUS is 0, otherwise it does not isolate that bonus.
const index = buildSearchIndex([
makeSourced({
digest: "partial",
spec: {
name: "a_train_task",
description: "Train something.",
inputs: [],
outputs: [],
implementation: { container: { image: "x" } },
},
}),
makeSourced({
digest: "complete",
spec: {
name: "z_train_task",
description: "Produces a model artifact.",
inputs: [],
outputs: [],
implementation: { container: { image: "x" } },
},
}),
]);

expect(lexicalSearch(index, "train model")[0]?.digest).toBe("complete");
});

it("expands domain-neutral synonyms", () => {
const index = buildSearchIndex([
makeSourced({
Expand Down
144 changes: 112 additions & 32 deletions src/services/componentSearchIndex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -364,18 +364,28 @@ const QUERY_STOP_WORDS = new Set([
* prevents common tokens like "a"/"to" from matching nearly every component and
* drowning out the useful intent terms.
*/
function uniqueTokens(tokens: string[]): string[] {
  // Returns the tokens in first-seen order, skipping any token that is in
  // QUERY_STOP_WORDS and any token that has already been emitted.
  const unique: string[] = [];
  const seen = new Set<string>();
  for (const token of tokens) {
    if (QUERY_STOP_WORDS.has(token) || seen.has(token)) continue;
    seen.add(token);
    unique.push(token);
  }
  return unique;
}
/**
 * Turn raw text into the token list used for per-token scoring: the output of
 * `baseSearchTokens` is passed through `expandSynonymTokens`, and the result is
 * then stripped of stop words and duplicates by `uniqueTokens`.
 */
function tokenize(text: string): string[] {
  const baseTokens = baseSearchTokens(text);
  const expandedTokens = expandSynonymTokens(baseTokens);
  return uniqueTokens(expandedTokens);
}

/**
 * Build the token list used for the phrase and all-tokens bonuses. Unlike
 * `tokenize`, no synonym expansion is applied here: the text goes through
 * `splitIdentifierText`, is lowercased, split on runs of characters outside
 * [a-z0-9], stemmed with `stemToken`, and finally filtered by `uniqueTokens`.
 */
function requiredQueryTokens(text: string): string[] {
  const lowered = splitIdentifierText(text).toLowerCase();
  const pieces = lowered.split(/[^a-z0-9]+/).filter(isNonEmptyString);
  const stemmed = pieces.map(stemToken);
  return uniqueTokens(stemmed);
}

/**
Expand All @@ -391,6 +401,17 @@ const FIELD_WEIGHTS: Record<MatchField, number> = {
metadata: 1,
};

// Flat bonus added once for each field whose separator-normalized text
// contains the required query tokens joined by single spaces. Only applied
// when the query has more than one required token.
const FIELD_PHRASE_BONUS: Record<MatchField, number> = {
name: 10,
description: 4,
io: 4,
implementation: 2,
metadata: 2,
};

// Fraction of a field's weighted token score that is added again when the
// query token begins one of that field's tokens.
const PREFIX_MATCH_BONUS_MULTIPLIER = 0.5;
// Flat bonus for an entry in which every required query token matches at least
// one field. Only applied when the query has more than one required token.
const ALL_QUERY_TOKENS_BONUS = 6;

const SEARCH_FIELDS: MatchField[] = [
"name",
"description",
Expand All @@ -409,48 +430,104 @@ interface SearchOptions {
minLength?: number;
}

/**
 * Split text into its non-empty runs of [a-z0-9]. Every other character,
 * uppercase letters included, acts as a separator.
 */
function searchableTokens(text: string): string[] {
  const pieces = text.split(/[^a-z0-9]+/);
  return pieces.filter(isNonEmptyString);
}

/**
 * Report whether the token occurs as a substring of any searchable field of
 * the entry. Fields are checked in SEARCH_FIELDS order and the scan stops at
 * the first hit.
 */
function entryMatchesToken(entry: IndexEntry, token: string): boolean {
  for (const field of SEARCH_FIELDS) {
    if (entry.searchable[field].includes(token)) {
      return true;
    }
  }
  return false;
}

/**
 * Compute a weight for each query token so that tokens matching few entries
 * count for more than tokens matching many.
 *
 * For each token the weight is `1 + max(0, ln((N + 1) / (df + 1)))`, where N
 * is the number of entries in the index and df is the number of entries that
 * `entryMatchesToken` reports as containing the token.
 *
 * @param index - Entries to measure token frequency against.
 * @param tokens - Query tokens to weight.
 * @returns Map from token to its weight (never below 1).
 */
function buildRareTokenWeights(
  index: IndexEntry[],
  tokens: string[],
): Map<string, number> {
  const entryCount = index.length;
  const weights = new Map<string, number>();

  for (const token of tokens) {
    let matchingEntries = 0;
    for (const entry of index) {
      if (entryMatchesToken(entry, token)) {
        matchingEntries += 1;
      }
    }

    const inverseFrequency = Math.log(
      (entryCount + 1) / (matchingEntries + 1),
    );
    weights.set(token, 1 + Math.max(0, inverseFrequency));
  }

  return weights;
}

/**
 * Score one entry against the tokenized query. The score is 0 when nothing
 * matched.
 *
 * Scoring model:
 * - Per query token: each field whose text contains the token contributes its
 *   FIELD_WEIGHTS value multiplied by the token's weight.
 * - Word-boundary matches (a query token that begins an indexed token,
 *   including exact matches) add a further PREFIX_MATCH_BONUS_MULTIPLIER share
 *   of that same weighted amount, useful for partial names.
 * - Rare query tokens count more than tokens that match many components; the
 *   per-token weights come from `tokenWeights` (1 when a token has no entry).
 * - Contiguous multi-token phrase matches add FIELD_PHRASE_BONUS for every
 *   field that contains the phrase.
 * - Entries in which every required token matches at least one field add
 *   ALL_QUERY_TOKENS_BONUS.
 *
 * Indexed text and query text are normalized before scoring; raw scores are
 * only used for ordering.
 *
 * @param entry - Index entry to score.
 * @param tokens - Query tokens used for per-token field scoring.
 * @param requiredTokens - Query tokens used for the phrase and all-tokens
 *   bonuses; both bonuses only apply when there is more than one.
 * @param tokenWeights - Weight per query token.
 * @returns The raw score and the fields that contributed to it, in the order
 *   they first matched.
 */
function scoreEntry(
  entry: IndexEntry,
  tokens: string[],
  requiredTokens: string[],
  tokenWeights: Map<string, number>,
): { score: number; matchedFields: MatchField[] } {
  const matched = new Set<MatchField>();
  let score = 0;

  // A field's tokenization depends only on the field, not the query token, so
  // split it once per entry (cached) rather than re-splitting inside the
  // per-query-token loop — that re-split is hot on every keystroke.
  const fieldTokenCache = new Map<MatchField, string[]>();
  const fieldTokensFor = (field: MatchField): string[] => {
    const cached = fieldTokenCache.get(field);
    if (cached) return cached;
    const fieldTokens = searchableTokens(entry.searchable[field]);
    fieldTokenCache.set(field, fieldTokens);
    return fieldTokens;
  };

  for (const token of tokens) {
    const tokenWeight = tokenWeights.get(token) ?? 1;
    for (const field of SEARCH_FIELDS) {
      const fieldText = entry.searchable[field];
      if (!fieldText.includes(token)) continue;

      const fieldWeight = FIELD_WEIGHTS[field];
      score += fieldWeight * tokenWeight;
      matched.add(field);

      const hasPrefixMatch = fieldTokensFor(field).some((fieldToken) =>
        fieldToken.startsWith(token),
      );
      if (hasPrefixMatch) {
        score += fieldWeight * PREFIX_MATCH_BONUS_MULTIPLIER * tokenWeight;
      }
    }
  }

  // A contiguous multi-token match is a very strong signal. Both sides are
  // normalized so the bonus also fires for snake_case text — query
  // "train test split" should match `train_test_split`, not just text that
  // happens to contain literal spaces.
  if (requiredTokens.length > 1) {
    const normalizedQuery = requiredTokens.join(" ");
    for (const field of SEARCH_FIELDS) {
      const normalizedField = entry.searchable[field].replace(
        /[^a-z0-9]+/g,
        " ",
      );
      if (!normalizedField.includes(normalizedQuery)) continue;
      score += FIELD_PHRASE_BONUS[field];
      matched.add(field);
    }
  }

  // Reward entries that cover the whole query, even when the tokens are
  // spread across different fields.
  if (
    requiredTokens.length > 1 &&
    requiredTokens.every((token) => entryMatchesToken(entry, token))
  ) {
    score += ALL_QUERY_TOKENS_BONUS;
  }

  return { score, matchedFields: [...matched] };
}

Expand All @@ -468,16 +545,19 @@ export function lexicalSearch(
const trimmed = query.trim().toLowerCase();
if (trimmed.length < minLength) return [];

const baseTokens = baseSearchTokens(trimmed);
const tokens = filterQueryTokens(expandSynonymTokens(baseTokens));
const tokens = tokenize(trimmed);
if (tokens.length === 0) return [];
const phraseTokens = baseTokens.filter(
(token) => !QUERY_STOP_WORDS.has(token),
);
const requiredTokens = requiredQueryTokens(trimmed);
const tokenWeights = buildRareTokenWeights(index, tokens);

const scored: Array<LexicalMatch & { score: number }> = [];
for (const entry of index) {
const { score, matchedFields } = scoreEntry(entry, tokens, phraseTokens);
const { score, matchedFields } = scoreEntry(
entry,
tokens,
requiredTokens,
tokenWeights,
);
if (score === 0) continue;
scored.push({
reference: entry.reference,
Expand Down
Loading