From 554c927f9bd5f0a68888ade4c819a82035b9bfb7 Mon Sep 17 00:00:00 2001 From: mbeaulne Date: Thu, 18 Jun 2026 13:34:55 -0400 Subject: [PATCH] =?UTF-8?q?Parse=20negative=20constraints:=20=E2=80=9Cwith?= =?UTF-8?q?out=E2=80=9D,=20=E2=80=9Cnot=E2=80=9D,=20=E2=80=9Cno=E2=80=9D,?= =?UTF-8?q?=20=E2=80=9Cexclude=E2=80=9D.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/services/componentSearchIndex.test.ts | 67 +++++++++++++++++++++++ src/services/componentSearchIndex.ts | 56 ++++++++++++++++++- 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/src/services/componentSearchIndex.test.ts b/src/services/componentSearchIndex.test.ts index 18491bc6e..219dbe169 100644 --- a/src/services/componentSearchIndex.test.ts +++ b/src/services/componentSearchIndex.test.ts @@ -498,6 +498,73 @@ describe("lexicalSearch", () => { expect(lexicalSearch(index, "df")[0]?.digest).toBe("table"); }); + it("parses negative constraints and excludes matching components", () => { + const index = buildSearchIndex([ + makeSourced({ + digest: "local-upload", + spec: { + name: "upload_file", + description: "Upload a file to a local directory.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "gcs-upload", + spec: { + name: "upload_to_gcs", + description: "Upload a file to GCS.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + expect( + lexicalSearch(index, "I want to upload a file but not to GCS").map( + (result) => result.digest, + ), + ).toEqual(["local-upload"]); + expect( + lexicalSearch(index, "I want to upload a file excluding GCS").map( + (result) => result.digest, + ), + ).toEqual(["local-upload"]); + }); + + it("excludes only the literal negated term, not its synonyms", () => { + const index = buildSearchIndex([ + makeSourced({ + digest: "gcs-uploader", + spec: { + name: "upload_to_gcs", + description: "Upload to GCS.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "storage-uploader", + spec: { + name: "upload_to_storage", + description: "Upload to a storage bucket.", + inputs: [], + outputs: [], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + // "gcs" is a synonym of "storage"/"bucket", but negation is literal: only + // the gcs component is dropped, the storage one survives. + const results = lexicalSearch(index, "upload not gcs").map((r) => r.digest); + expect(results).toContain("storage-uploader"); + expect(results).not.toContain("gcs-uploader"); + }); + it("ignores natural-language filler words that would otherwise swamp intent", () => { const index = buildSearchIndex([ makeSourced({ diff --git a/src/services/componentSearchIndex.ts b/src/services/componentSearchIndex.ts index 002f22c6c..fa5be28d4 100644 --- a/src/services/componentSearchIndex.ts +++ b/src/services/componentSearchIndex.ts @@ -340,6 +340,7 @@ const QUERY_STOP_WORDS = new Set([ "an", "and", "are", + "but", "component", "for", "from", @@ -348,6 +349,7 @@ const QUERY_STOP_WORDS = new Set([ "into", "me", "my", + "no", "of", "on", "please", @@ -388,6 +390,34 @@ function requiredQueryTokens(text: string): string[] { return uniqueTokens(rawTokens); } +interface ParsedSearchQuery { + positiveText: string; + negativeText: string; +} + +// Bind a negation to its term(s): capture consecutive words but stop at a +// conjunction/filler word (and punctuation), so "not gcs and also train" +// excludes only "gcs" and leaves "and also train" for positive matching +// instead of swallowing the whole tail. +const NEGATIVE_CONSTRAINT_PATTERN = + /\b(?:without|excluding|exclude|not|no)\b\s+(?:(?:to|use|using)\s+)?([a-z0-9][a-z0-9-]*(?:\s+(?!(?:and|or|but|then|also|plus|with)\b)[a-z0-9][a-z0-9-]*)*)/gi; + +function parseSearchQuery(text: string): ParsedSearchQuery { + const negativeParts: string[] = []; + const positiveText = text.replace( + NEGATIVE_CONSTRAINT_PATTERN, + (_match, negativePart: string) => { + negativeParts.push(negativePart); + return " "; + }, + ); + + return { + positiveText, + negativeText: negativeParts.join(" "), + }; +} + /** * Per-field weights. Name matches are by far the most signal: `train` in the * name means the component is *about* training. The same word in implementation @@ -526,6 +556,7 @@ function scoreEntry( entry: IndexEntry, tokens: string[], requiredTokens: string[], + negativeTokens: string[], tokenWeights: Map, ): { score: number; matchedFields: MatchField[] } { const matched = new Set(); @@ -543,6 +574,17 @@ function scoreEntry( return fieldTokens; }; + // Hard exclusion uses a whole-token match (not substring): a zero-score + // filter removes the entire component, so a short negated token must not + // knock one out by incidentally appearing inside an unrelated word. + if ( + negativeTokens.some((token) => + SEARCH_FIELDS.some((field) => fieldTokensFor(field).includes(token)), + ) + ) { + return { score: 0, matchedFields: [] }; + } + for (const token of tokens) { const tokenWeight = tokenWeights.get(token) ?? 1; for (const field of SEARCH_FIELDS) { @@ -609,9 +651,18 @@ export function lexicalSearch( const trimmed = query.trim().toLowerCase(); if (trimmed.length < minLength) return []; - const tokens = tokenize(trimmed); + const parsedQuery = parseSearchQuery(trimmed); + const tokens = tokenize(parsedQuery.positiveText); + // An all-negative query (e.g. "not gcs") has no positive intent to rank. + // Exclusion only filters positive matches — it doesn't enumerate the whole + // library — so there's nothing to return. if (tokens.length === 0) return []; - const requiredTokens = requiredQueryTokens(trimmed); + const requiredTokens = requiredQueryTokens(parsedQuery.positiveText); + // Literal exclusion: don't synonym-expand the negated term, so "not gcs" + // removes gcs components without also dropping storage/bucket/etc. + const negativeTokens = uniqueTokens( + baseSearchTokens(parsedQuery.negativeText), + ); const tokenWeights = buildRareTokenWeights(index, tokens); const scored: Array = []; @@ -620,6 +671,7 @@ export function lexicalSearch( entry, tokens, requiredTokens, + negativeTokens, tokenWeights, ); if (score === 0) continue;