Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 53 additions & 4 deletions src/services/componentSearchIndex.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,59 @@ describe("lexicalSearch", () => {
expect(results[0]?.digest).toBe("contiguous");
});

it("tokenizes snake_case queries so each segment matches independently", () => {
const index = buildSearchIndex(fixtures);
const results = lexicalSearch(index, "drop nulls");
expect(results[0]?.digest).toBe("b");
it("normalizes snake_case, kebab-case, and camelCase text", () => {
  // One component per naming convention, paired with the space-separated
  // query that is expected to rank that component first.
  const conventions = [
    { digest: "snake", name: "drop_nulls", query: "drop nulls" },
    { digest: "kebab", name: "train-model", query: "train model" },
    { digest: "camel", name: "loadCSVFile", query: "load csv file" },
  ];

  const index = buildSearchIndex(
    conventions.map(({ digest, name }) =>
      makeSourced({
        digest,
        spec: {
          name,
          inputs: [],
          outputs: [],
          implementation: { container: { image: "x" } },
        },
      }),
    ),
  );

  // All three components share one index, so each query has to pick out its
  // own component ahead of the other two.
  for (const { digest, query } of conventions) {
    expect(lexicalSearch(index, query)[0]?.digest).toBe(digest);
  }
});

it("normalizes plurals and simple stemmed terms", () => {
  const component = makeSourced({
    digest: "normalize",
    spec: {
      name: "train_model",
      description: "Train models on a dataset with labeled batches.",
      inputs: [],
      outputs: [],
      implementation: { container: { image: "x" } },
    },
  });
  const index = buildSearchIndex([component]);

  // Each query differs from the indexed wording only by inflection:
  // "training" vs. "train", "datasets" vs. "dataset", "batch" vs. "batches".
  const inflectedQueries = ["training", "datasets", "batch"];
  for (const query of inflectedQueries) {
    expect(lexicalSearch(index, query)[0]?.digest).toBe("normalize");
  }
});

it("ignores natural-language filler words that would otherwise swamp intent", () => {
Expand Down
91 changes: 78 additions & 13 deletions src/services/componentSearchIndex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ export interface IndexEntry {
name: string;
/** Where this component came from. */
source: ComponentSearchSource;
/** Pre-lowercased searchable text, one per logical field. */
/** Normalized searchable text, one per logical field. */
searchable: Record<MatchField, string>;
}

Expand Down Expand Up @@ -113,6 +113,68 @@ function stringifySearchValue(value: unknown): string {
}
}

/**
 * Inserts spaces at identifier word boundaries so that snake_case,
 * kebab-case, and camelCase names break into separate words.
 *
 * Casing is preserved; only boundaries are changed:
 *   "loadCSVFile" -> "load CSV File"
 *   "drop_nulls"  -> "drop nulls"
 *   "train-model" -> "train model"
 *
 * @param text - Raw text that may contain identifiers.
 * @returns The text with a space at each detected word boundary.
 */
function splitIdentifierText(text: string): string {
  // End of an acronym followed by a capitalized word: "CSVFile" -> "CSV File".
  const acronymBoundary = /([A-Z])([A-Z][a-z])/g;
  // Lowercase letter or digit followed by a capital: "loadCSV" -> "load CSV".
  const camelBoundary = /([a-z0-9])([A-Z])/g;
  // Any run of underscores and/or hyphens becomes a single space.
  const separatorRun = /[_-]+/g;

  const acronymsSplit = text.replace(acronymBoundary, "$1 $2");
  const camelSplit = acronymsSplit.replace(camelBoundary, "$1 $2");
  return camelSplit.replace(separatorRun, " ");
}

/**
 * Removes the last `suffixLength` characters from `token` and, if what is left
 * ends in a doubled consonant, drops one copy of it so that inflections formed
 * by consonant doubling reduce to their base ("running" -> "runn" -> "run").
 *
 * Stems shorter than three characters are returned as-is, without collapsing.
 * Doubled vowels and digits are left alone: English suffixes do not double
 * them, so "seeing" reduces to "see" rather than "se".
 *
 * @param token - Token to strip. Expected to be lowercase; only lowercase
 *   consonants are ever collapsed.
 * @param suffixLength - Number of trailing characters to remove. Zero or a
 *   negative value leaves the token untouched.
 * @returns The stripped stem, with a doubled final consonant reduced to one.
 */
function removeSuffixAndCollapseDoubleFinal(
  token: string,
  suffixLength: number,
): string {
  // Guard first: `token.slice(0, -0)` is `token.slice(0, 0)`, which would
  // silently return "" instead of the whole token.
  if (suffixLength <= 0) return token;

  const stem = token.slice(0, Math.max(0, token.length - suffixLength));
  if (stem.length < 3) return stem;

  const finalChar = stem.charAt(stem.length - 1);
  const endsInDoubledChar = finalChar === stem.charAt(stem.length - 2);
  // Every lowercase ASCII letter except a, e, i, o, u.
  const isConsonant = /^[b-df-hj-np-tv-z]$/.test(finalChar);

  return endsInDoubledChar && isConsonant ? stem.slice(0, -1) : stem;
}

Comment thread
Mbeaulne marked this conversation as resolved.
/** Plural endings formed with "-es" after a sibilant: batches, dishes, boxes, classes. */
const SIBILANT_PLURAL_ENDING = /(ches|shes|xes|zes|ses)$/;

/** Matches any vowel; "y" counts so that "trying" -> "try" is still stemmed. */
const STEM_VOWEL = /[aeiouy]/;

/**
 * Reduces a lowercase token to a rough stem using a handful of English suffix
 * rules. This is a lightweight heuristic, not a full stemmer.
 *
 * Rules are tried in order and the first match wins:
 *   1. Tokens of three characters or fewer are returned unchanged.
 *   2. "-ies"  -> "y"                     ("queries" -> "query")
 *   3. "-ing"  -> removed, doubled final handled by the helper
 *                                         ("training" -> "train")
 *   4. "-ed"   -> removed, doubled final handled by the helper
 *                                         ("stopped" -> "stop")
 *   5. "-ches/-shes/-xes/-zes/-ses" -> drop "es"   ("batches" -> "batch")
 *   6. "-s" (but not "-ss", "-is", "-us") -> drop "s" ("models" -> "model")
 *
 * For rules 3 and 4 the suffix is only stripped when the remaining stem
 * contains a vowel. Without that check, words whose ending is part of the root
 * collapse to a consonant cluster ("string" -> "str", "spring" -> "spr",
 * "shred" -> "shr"). Such fragments are then emitted as extra search tokens.
 * NOTE(review): if downstream matching is substring-based, a fragment like
 * "str" would also hit "stream" or "struct" — confirm against the scorer.
 *
 * @param token - A single lowercase alphanumeric token.
 * @returns The stemmed token, or the token itself when no rule applies.
 */
function stemToken(token: string): string {
  if (token.length <= 3) return token;

  if (token.endsWith("ies") && token.length > 4) {
    return `${token.slice(0, -3)}y`;
  }

  if (token.endsWith("ing") && token.length > 5) {
    return STEM_VOWEL.test(token.slice(0, -3))
      ? removeSuffixAndCollapseDoubleFinal(token, 3)
      : token;
  }

  if (token.endsWith("ed") && token.length > 4) {
    return STEM_VOWEL.test(token.slice(0, -2))
      ? removeSuffixAndCollapseDoubleFinal(token, 2)
      : token;
  }

  if (SIBILANT_PLURAL_ENDING.test(token) && token.length > 4) {
    return token.slice(0, -2);
  }

  // Length is already known to exceed three here, so only the ending matters.
  // "-ss", "-is" and "-us" are skipped because they are rarely plurals
  // ("class", "analysis", "status").
  if (
    token.endsWith("s") &&
    !token.endsWith("ss") &&
    !token.endsWith("is") &&
    !token.endsWith("us")
  ) {
    return token.slice(0, -1);
  }

  return token;
}

/**
 * Builds the normalized form of `text` used for both indexed fields and
 * queries.
 *
 * The result is three views of the input joined by single spaces:
 *   1. the raw text, lowercased;
 *   2. the identifier-split text, lowercased;
 *   3. every alphanumeric token of (2) together with its stem, de-duplicated
 *      and kept in first-seen order.
 *
 * @param text - Raw field or query text.
 * @returns The space-joined concatenation of the three views.
 */
function normalizeSearchText(text: string): string {
  const loweredRaw = text.toLowerCase();
  const loweredSplit = splitIdentifierText(text).toLowerCase();

  const words = loweredSplit.split(/[^a-z0-9]+/).filter(isNonEmptyString);

  // A Set keeps insertion order, so each word is followed by its stem and any
  // variant that has already appeared is dropped.
  const uniqueVariants = new Set<string>(
    words.flatMap((word) => [word, stemToken(word)]),
  );

  return `${loweredRaw} ${loweredSplit} ${Array.from(uniqueVariants).join(" ")}`;
}

function extractAnnotationsText(
annotations: Record<string, unknown> | undefined,
): string {
Expand Down Expand Up @@ -243,14 +305,17 @@ export function buildSearchIndex(sourced: SourcedReference[]): IndexEntry[] {
name: metadata.name,
source,
searchable: {
name: metadata.name.toLowerCase(),
description: metadata.description.toLowerCase(),
io: metadata.ioText.toLowerCase(),
implementation: extractImplementationText(reference),
metadata: [metadata.metadataText, source.label, reference.published_by]
.filter(isNonEmptyString)
.join(" ")
.toLowerCase(),
name: normalizeSearchText(metadata.name),
description: normalizeSearchText(metadata.description),
io: normalizeSearchText(metadata.ioText),
implementation: normalizeSearchText(
extractImplementationText(reference),
),
metadata: normalizeSearchText(
[metadata.metadataText, source.label, reference.published_by]
.filter(isNonEmptyString)
.join(" "),
),
},
});
}
Expand Down Expand Up @@ -288,10 +353,9 @@ const QUERY_STOP_WORDS = new Set([
* nearly every component and drowning out the useful intent terms.
*/
function tokenize(text: string): string[] {
const rawTokens = text
.toLowerCase()
const rawTokens = normalizeSearchText(text)
Comment thread
Mbeaulne marked this conversation as resolved.
.split(/[^a-z0-9]+/)
.filter((t) => t.length > 0);
.filter(isNonEmptyString);

const tokens: string[] = [];
const seen = new Set<string>();
Expand Down Expand Up @@ -346,7 +410,8 @@ interface SearchOptions {
* "train test split" matching `train_test_split` strongly even though we
* tokenized.
*
* We deliberately do not normalize — raw scores are only used for ordering.
* Indexed text and query text are normalized before scoring; raw scores are
* only used for ordering.
*/
function scoreEntry(
entry: IndexEntry,
Expand Down
Loading