Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions src/services/componentSearchIndex.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,73 @@ describe("lexicalSearch", () => {
expect(lexicalSearch(index, "df")[0]?.digest).toBe("table");
});

it("parses negative constraints and excludes matching components", () => {
  // Builds a minimal container component; only the digest, name and
  // description vary between the fixtures in this test.
  const uploader = (digest: string, name: string, description: string) =>
    makeSourced({
      digest,
      spec: {
        name,
        description,
        inputs: [],
        outputs: [],
        implementation: { container: { image: "x" } },
      },
    });

  const index = buildSearchIndex([
    uploader(
      "local-upload",
      "upload_file",
      "Upload a file to a local directory.",
    ),
    uploader("gcs-upload", "upload_to_gcs", "Upload a file to GCS."),
  ]);

  // Two phrasings of the same exclusion ("but not to X" / "excluding X"):
  // each must leave only the component that never mentions GCS.
  const negatedQueries = [
    "I want to upload a file but not to GCS",
    "I want to upload a file excluding GCS",
  ];
  for (const query of negatedQueries) {
    const digests = lexicalSearch(index, query).map((result) => result.digest);
    expect(digests).toEqual(["local-upload"]);
  }
});

it("excludes only the literal negated term, not its synonyms", () => {
  // Minimal container component fixture; only digest, name and description
  // differ between the two entries below.
  const component = (digest: string, name: string, description: string) =>
    makeSourced({
      digest,
      spec: {
        name,
        description,
        inputs: [],
        outputs: [],
        implementation: { container: { image: "x" } },
      },
    });

  const gcsUploader = component(
    "gcs-uploader",
    "upload_to_gcs",
    "Upload to GCS.",
  );
  const storageUploader = component(
    "storage-uploader",
    "upload_to_storage",
    "Upload to a storage bucket.",
  );
  const index = buildSearchIndex([gcsUploader, storageUploader]);

  // "gcs" is a synonym of "storage"/"bucket", but negation is literal: only
  // the gcs component is dropped, the storage one survives.
  const matches = lexicalSearch(index, "upload not gcs");
  const results = matches.map((r) => r.digest);
  expect(results).toContain("storage-uploader");
  expect(results).not.toContain("gcs-uploader");
});

it("ignores natural-language filler words that would otherwise swamp intent", () => {
const index = buildSearchIndex([
makeSourced({
Expand Down
56 changes: 54 additions & 2 deletions src/services/componentSearchIndex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ const QUERY_STOP_WORDS = new Set([
"an",
"and",
"are",
"but",
"component",
"for",
"from",
Expand All @@ -348,6 +349,7 @@ const QUERY_STOP_WORDS = new Set([
"into",
"me",
"my",
"no",
"of",
"on",
"please",
Expand Down Expand Up @@ -388,6 +390,34 @@ function requiredQueryTokens(text: string): string[] {
return uniqueTokens(rawTokens);
}

/**
 * A free-text search query split into the part to match and the part the
 * user explicitly ruled out (see `parseSearchQuery`).
 */
interface ParsedSearchQuery {
// The query text with every matched negation clause replaced by a single
// space.
positiveText: string;
// The captured negated phrases joined with single spaces; the empty string
// when the query contains no negation clause.
negativeText: string;
}

// Matches one negation clause in a search query and captures the negated
// phrase in group 1.
//
//   trigger:  "without", "excluding", "exclude", "not" or "no" as a whole
//             word, followed by whitespace
//   optional: a single "to", "use" or "using" directly after the trigger is
//             skipped, so "not to gcs" captures just "gcs"
//   capture:  one or more whitespace-separated words made of ASCII letters,
//             digits and hyphens, each starting with a letter or digit
//
// Bind a negation to its term(s): capture consecutive words but stop at a
// conjunction/filler word ("and", "or", "but", "then", "also", "plus",
// "with") or at punctuation, so "not gcs and also train" excludes only "gcs"
// and leaves "and also train" for positive matching instead of swallowing the
// whole tail.
//
// The stop-word lookahead only guards the second and later captured words;
// the first word after the trigger is captured even if it is a stop word.
//
// Flags: `i` matches case-insensitively, `g` matches every clause in the
// query. NOTE(review): a `g` regex carries `lastIndex` state between calls to
// `.test()`/`.exec()`; `String.prototype.replace` resets it, so keep usage of
// this shared instance to `replace`-style calls.
const NEGATIVE_CONSTRAINT_PATTERN =
/\b(?:without|excluding|exclude|not|no)\b\s+(?:(?:to|use|using)\s+)?([a-z0-9][a-z0-9-]*(?:\s+(?!(?:and|or|but|then|also|plus|with)\b)[a-z0-9][a-z0-9-]*)*)/gi;

/**
 * Splits a raw query into its positive text and its negated phrases.
 *
 * Every clause matched by `NEGATIVE_CONSTRAINT_PATTERN` is cut out of the
 * query (a single space is left in its place so neighbouring words do not
 * fuse), and the phrase captured by the pattern is collected separately.
 *
 * @param text - The query to split, used exactly as given.
 * @returns `positiveText`: the query with the negation clauses blanked out;
 *   `negativeText`: the captured phrases joined by single spaces, or the
 *   empty string when nothing was negated.
 */
function parseSearchQuery(text: string): ParsedSearchQuery {
  const excludedPhrases: string[] = [];

  // Replacer: remember the captured phrase (group 1), blank out the clause.
  const collectAndBlank = (_clause: string, phrase: string): string => {
    excludedPhrases.push(phrase);
    return " ";
  };

  const remainder = text.replace(NEGATIVE_CONSTRAINT_PATTERN, collectAndBlank);
  const negativeText = excludedPhrases.join(" ");

  return { positiveText: remainder, negativeText };
}

/**
* Per-field weights. Name matches are by far the most signal: `train` in the
* name means the component is *about* training. The same word in implementation
Expand Down Expand Up @@ -526,6 +556,7 @@ function scoreEntry(
entry: IndexEntry,
tokens: string[],
requiredTokens: string[],
negativeTokens: string[],
tokenWeights: Map<string, number>,
): { score: number; matchedFields: MatchField[] } {
const matched = new Set<MatchField>();
Expand All @@ -543,6 +574,17 @@ function scoreEntry(
return fieldTokens;
};

// Hard exclusion uses a whole-token match (not substring): a zero-score
// filter removes the entire component, so a short negated token must not
// knock one out by incidentally appearing inside an unrelated word.
if (
negativeTokens.some((token) =>
SEARCH_FIELDS.some((field) => fieldTokensFor(field).includes(token)),
)
) {
return { score: 0, matchedFields: [] };
}

for (const token of tokens) {
const tokenWeight = tokenWeights.get(token) ?? 1;
for (const field of SEARCH_FIELDS) {
Expand Down Expand Up @@ -609,9 +651,18 @@ export function lexicalSearch(
const trimmed = query.trim().toLowerCase();
if (trimmed.length < minLength) return [];

const tokens = tokenize(trimmed);
const parsedQuery = parseSearchQuery(trimmed);
const tokens = tokenize(parsedQuery.positiveText);
Comment thread
Mbeaulne marked this conversation as resolved.
// An all-negative query (e.g. "not gcs") has no positive intent to rank.
// Exclusion only filters positive matches — it doesn't enumerate the whole
// library — so there's nothing to return.
if (tokens.length === 0) return [];
const requiredTokens = requiredQueryTokens(trimmed);
const requiredTokens = requiredQueryTokens(parsedQuery.positiveText);
// Literal exclusion: don't synonym-expand the negated term, so "not gcs"
// removes gcs components without also dropping storage/bucket/etc.
const negativeTokens = uniqueTokens(
baseSearchTokens(parsedQuery.negativeText),
);
const tokenWeights = buildRareTokenWeights(index, tokens);

const scored: Array<LexicalMatch & { score: number }> = [];
Expand All @@ -620,6 +671,7 @@ export function lexicalSearch(
entry,
tokens,
requiredTokens,
negativeTokens,
tokenWeights,
);
if (score === 0) continue;
Expand Down
Loading